{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 100, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021376085504342017, "grad_norm": 4.503899550790205, "learning_rate": 2.127659574468085e-08, "logits/chosen": -0.8003637194633484, "logits/rejected": -0.8448871970176697, "logps/chosen": -212.04685974121094, "logps/rejected": -206.4463348388672, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0042752171008684035, "grad_norm": 4.89256031461174, "learning_rate": 4.25531914893617e-08, "logits/chosen": -0.750135064125061, "logits/rejected": -0.7247368097305298, "logps/chosen": -271.5355529785156, "logps/rejected": -260.5343322753906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.006412825651302605, "grad_norm": 4.511049028695194, "learning_rate": 6.382978723404254e-08, "logits/chosen": -0.9132480621337891, "logits/rejected": -0.9213609099388123, "logps/chosen": -259.10791015625, "logps/rejected": -262.6512756347656, "loss": 0.6935, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0005805277032777667, "rewards/margins": -0.001751818461343646, "rewards/rejected": 0.0011712908744812012, "step": 3 }, { "epoch": 0.008550434201736807, "grad_norm": 5.0258481504448485, "learning_rate": 8.51063829787234e-08, "logits/chosen": -0.8424134850502014, "logits/rejected": -0.8080853223800659, "logps/chosen": -251.00387573242188, "logps/rejected": -255.1189422607422, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0018655203748494387, "rewards/margins": 0.0010831927647814155, "rewards/rejected": 0.0007823276100680232, "step": 4 }, { "epoch": 0.01068804275217101, "grad_norm": 4.75851133644133, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -0.9411681294441223, "logits/rejected": -0.9376619458198547, "logps/chosen": -289.8980407714844, "logps/rejected": -274.7005615234375, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0026531934272497892, "rewards/margins": 0.0023759508039802313, "rewards/rejected": 0.00027724262326955795, "step": 5 }, { "epoch": 0.01282565130260521, "grad_norm": 4.443327602655402, "learning_rate": 1.2765957446808508e-07, "logits/chosen": -0.7161233425140381, "logits/rejected": -0.6978777647018433, "logps/chosen": -223.0089569091797, "logps/rejected": -222.1771240234375, "loss": 0.6934, "rewards/accuracies": 0.34375, "rewards/chosen": -0.0012396120000630617, "rewards/margins": -0.0016972327139228582, "rewards/rejected": 0.00045762062654830515, "step": 6 }, { "epoch": 0.014963259853039413, "grad_norm": 5.506063836746189, "learning_rate": 1.4893617021276595e-07, "logits/chosen": -0.9607124924659729, "logits/rejected": -0.9491544961929321, "logps/chosen": -310.2432556152344, "logps/rejected": -305.9755554199219, "loss": 0.6926, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0012061572633683681, "rewards/margins": -0.0002907347516156733, "rewards/rejected": 0.0014968919567763805, "step": 7 }, { "epoch": 0.017100868403473614, "grad_norm": 4.851635423100062, "learning_rate": 1.702127659574468e-07, "logits/chosen": -0.8928542137145996, "logits/rejected": -0.8853560090065002, "logps/chosen": -247.1142120361328, "logps/rejected": -244.08663940429688, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0002220773312728852, "rewards/margins": 0.0013035106239840388, "rewards/rejected": -0.0010814334964379668, "step": 8 }, { "epoch": 0.019238476953907815, "grad_norm": 4.87939101936585, "learning_rate": 1.9148936170212765e-07, "logits/chosen": -0.8140461444854736, "logits/rejected": -0.8076512813568115, "logps/chosen": -272.2711486816406, "logps/rejected": -284.1283264160156, "loss": 0.6935, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0010817217407748103, "rewards/margins": -0.001508195186033845, "rewards/rejected": 0.0004264736198820174, "step": 9 }, { "epoch": 0.02137608550434202, "grad_norm": 4.562355516566984, "learning_rate": 2.127659574468085e-07, "logits/chosen": -0.8849949836730957, "logits/rejected": -0.8811756372451782, "logps/chosen": -229.57052612304688, "logps/rejected": -231.6889190673828, "loss": 0.6931, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00039355267654173076, "rewards/margins": 0.0002220940077677369, "rewards/rejected": 0.0001714585960144177, "step": 10 }, { "epoch": 0.02351369405477622, "grad_norm": 4.67288441235731, "learning_rate": 2.3404255319148937e-07, "logits/chosen": -0.8189717531204224, "logits/rejected": -0.8200615644454956, "logps/chosen": -273.552734375, "logps/rejected": -277.36859130859375, "loss": 0.6934, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0009033679380081594, "rewards/margins": -0.0013172316830605268, "rewards/rejected": 0.0004138636286370456, "step": 11 }, { "epoch": 0.02565130260521042, "grad_norm": 4.805681101367893, "learning_rate": 2.5531914893617016e-07, "logits/chosen": -0.9043698906898499, "logits/rejected": -0.8993241190910339, "logps/chosen": -273.664306640625, "logps/rejected": -268.0246887207031, "loss": 0.6929, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0001288223429583013, "rewards/margins": -3.8141035474836826e-05, "rewards/rejected": 0.00016696332022547722, "step": 12 }, { "epoch": 0.02778891115564462, "grad_norm": 4.91733558840618, "learning_rate": 2.7659574468085106e-07, "logits/chosen": -0.8745774626731873, "logits/rejected": -0.8446710705757141, "logps/chosen": -243.00827026367188, "logps/rejected": -229.5283203125, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": -0.002203774405643344, "rewards/margins": -0.00030036212410777807, "rewards/rejected": -0.0019034123979508877, "step": 13 }, { "epoch": 0.029926519706078826, "grad_norm": 5.299324976103458, "learning_rate": 2.978723404255319e-07, "logits/chosen": -0.7348307967185974, "logits/rejected": -0.7354189157485962, "logps/chosen": -186.85391235351562, "logps/rejected": -199.67623901367188, "loss": 0.6932, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0016946769319474697, "rewards/margins": -0.0006936597637832165, "rewards/rejected": -0.001001017284579575, "step": 14 }, { "epoch": 0.03206412825651302, "grad_norm": 4.755602904170831, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.7406636476516724, "logits/rejected": -0.7166301608085632, "logps/chosen": -199.6678466796875, "logps/rejected": -194.37559509277344, "loss": 0.6928, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0003121185291092843, "rewards/margins": 0.0012607788667082787, "rewards/rejected": -0.0009486603084951639, "step": 15 }, { "epoch": 0.03420173680694723, "grad_norm": 4.853620806434979, "learning_rate": 3.404255319148936e-07, "logits/chosen": -0.78841632604599, "logits/rejected": -0.7843498587608337, "logps/chosen": -266.4180908203125, "logps/rejected": -271.6226806640625, "loss": 0.693, "rewards/accuracies": 0.625, "rewards/chosen": -0.0009592341957613826, "rewards/margins": 0.0019549226853996515, "rewards/rejected": -0.002914156997576356, "step": 16 }, { "epoch": 0.03633934535738143, "grad_norm": 5.423827110862174, "learning_rate": 3.617021276595745e-07, "logits/chosen": -0.9736945629119873, "logits/rejected": -0.9769234657287598, "logps/chosen": -258.8900146484375, "logps/rejected": -264.2679748535156, "loss": 0.6935, "rewards/accuracies": 0.25, "rewards/chosen": -0.0061743613332509995, "rewards/margins": -0.001890932791866362, "rewards/rejected": -0.0042834291234612465, "step": 17 }, { "epoch": 0.03847695390781563, "grad_norm": 4.824497254280432, "learning_rate": 3.829787234042553e-07, "logits/chosen": -0.851763904094696, "logits/rejected": -0.8533320426940918, "logps/chosen": -273.1241760253906, "logps/rejected": -269.42315673828125, "loss": 0.6927, "rewards/accuracies": 0.375, "rewards/chosen": -0.0023198507260531187, "rewards/margins": -0.0010572766186669469, "rewards/rejected": -0.0012625741073861718, "step": 18 }, { "epoch": 0.040614562458249834, "grad_norm": 4.885682499438778, "learning_rate": 4.0425531914893614e-07, "logits/chosen": -0.9122135043144226, "logits/rejected": -0.9140520095825195, "logps/chosen": -336.9332580566406, "logps/rejected": -327.79571533203125, "loss": 0.6932, "rewards/accuracies": 0.375, "rewards/chosen": -0.007231764495372772, "rewards/margins": -0.00179797422606498, "rewards/rejected": -0.005433791317045689, "step": 19 }, { "epoch": 0.04275217100868404, "grad_norm": 4.403742601709981, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.8458749055862427, "logits/rejected": -0.8761993646621704, "logps/chosen": -258.8704833984375, "logps/rejected": -263.5494079589844, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.005775632336735725, "rewards/margins": 0.00025997147895395756, "rewards/rejected": -0.006035604514181614, "step": 20 }, { "epoch": 0.044889779559118236, "grad_norm": 5.1980600006783195, "learning_rate": 4.4680851063829783e-07, "logits/chosen": -0.7707018852233887, "logits/rejected": -0.7247700691223145, "logps/chosen": -233.66183471679688, "logps/rejected": -255.91018676757812, "loss": 0.6924, "rewards/accuracies": 0.46875, "rewards/chosen": -0.006643190514296293, "rewards/margins": 0.0013489744160324335, "rewards/rejected": -0.00799216516315937, "step": 21 }, { "epoch": 0.04702738810955244, "grad_norm": 4.514553831312047, "learning_rate": 4.6808510638297873e-07, "logits/chosen": -0.8653970956802368, "logits/rejected": -0.8456276059150696, "logps/chosen": -245.4098663330078, "logps/rejected": -248.41461181640625, "loss": 0.6928, "rewards/accuracies": 0.46875, "rewards/chosen": -0.005286152008920908, "rewards/margins": 0.000985494116321206, "rewards/rejected": -0.006271645426750183, "step": 22 }, { "epoch": 0.04916499665998664, "grad_norm": 4.80080663754473, "learning_rate": 4.893617021276595e-07, "logits/chosen": -0.8655314445495605, "logits/rejected": -0.8451917171478271, "logps/chosen": -252.33546447753906, "logps/rejected": -260.81475830078125, "loss": 0.6921, "rewards/accuracies": 0.625, "rewards/chosen": -0.006377603858709335, "rewards/margins": 0.00297079561278224, "rewards/rejected": -0.009348399937152863, "step": 23 }, { "epoch": 0.05130260521042084, "grad_norm": 5.481285264708149, "learning_rate": 5.106382978723403e-07, "logits/chosen": -0.7310451865196228, "logits/rejected": -0.7366085648536682, "logps/chosen": -238.02166748046875, "logps/rejected": -245.17308044433594, "loss": 0.6918, "rewards/accuracies": 0.5625, "rewards/chosen": -0.010805780068039894, "rewards/margins": 0.0019894172437489033, "rewards/rejected": -0.01279519684612751, "step": 24 }, { "epoch": 0.053440213760855046, "grad_norm": 4.561792775392447, "learning_rate": 5.319148936170212e-07, "logits/chosen": -0.9254141449928284, "logits/rejected": -0.939468502998352, "logps/chosen": -269.6241455078125, "logps/rejected": -282.4432067871094, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -0.01286405324935913, "rewards/margins": 0.001102518755942583, "rewards/rejected": -0.013966571539640427, "step": 25 }, { "epoch": 0.05557782231128924, "grad_norm": 4.85781011184185, "learning_rate": 5.531914893617021e-07, "logits/chosen": -0.8391819000244141, "logits/rejected": -0.8546662330627441, "logps/chosen": -271.26068115234375, "logps/rejected": -267.31024169921875, "loss": 0.6925, "rewards/accuracies": 0.5625, "rewards/chosen": -0.008641035296022892, "rewards/margins": 0.0034573455341160297, "rewards/rejected": -0.012098381295800209, "step": 26 }, { "epoch": 0.05771543086172345, "grad_norm": 5.072033355975492, "learning_rate": 5.74468085106383e-07, "logits/chosen": -0.8844251036643982, "logits/rejected": -0.8849300742149353, "logps/chosen": -243.93980407714844, "logps/rejected": -248.54537963867188, "loss": 0.6927, "rewards/accuracies": 0.59375, "rewards/chosen": -0.015005933120846748, "rewards/margins": 0.0034025944769382477, "rewards/rejected": -0.018408527597784996, "step": 27 }, { "epoch": 0.05985303941215765, "grad_norm": 4.905934366826652, "learning_rate": 5.957446808510638e-07, "logits/chosen": -0.724337637424469, "logits/rejected": -0.7232470512390137, "logps/chosen": -262.2066345214844, "logps/rejected": -267.26116943359375, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": -0.014386076480150223, "rewards/margins": -0.0017046784050762653, "rewards/rejected": -0.012681398540735245, "step": 28 }, { "epoch": 0.06199064796259185, "grad_norm": 4.7342802483142705, "learning_rate": 6.170212765957446e-07, "logits/chosen": -0.8244236707687378, "logits/rejected": -0.8045285940170288, "logps/chosen": -218.7688751220703, "logps/rejected": -219.35711669921875, "loss": 0.6898, "rewards/accuracies": 0.6875, "rewards/chosen": -0.014018207788467407, "rewards/margins": 0.00474612507969141, "rewards/rejected": -0.018764331936836243, "step": 29 }, { "epoch": 0.06412825651302605, "grad_norm": 5.185028135772882, "learning_rate": 6.382978723404255e-07, "logits/chosen": -0.7685129642486572, "logits/rejected": -0.7588883638381958, "logps/chosen": -265.58447265625, "logps/rejected": -271.6627502441406, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": -0.023013589903712273, "rewards/margins": -6.0059886891394854e-05, "rewards/rejected": -0.02295352704823017, "step": 30 }, { "epoch": 0.06626586506346026, "grad_norm": 5.174402492219036, "learning_rate": 6.595744680851063e-07, "logits/chosen": -0.8060805797576904, "logits/rejected": -0.8104574084281921, "logps/chosen": -253.12918090820312, "logps/rejected": -262.47772216796875, "loss": 0.6926, "rewards/accuracies": 0.59375, "rewards/chosen": -0.027180161327123642, "rewards/margins": 0.0009983510244637728, "rewards/rejected": -0.028178514912724495, "step": 31 }, { "epoch": 0.06840347361389446, "grad_norm": 4.839677584710031, "learning_rate": 6.808510638297872e-07, "logits/chosen": -0.8107847571372986, "logits/rejected": -0.8056558966636658, "logps/chosen": -247.47384643554688, "logps/rejected": -259.930419921875, "loss": 0.6922, "rewards/accuracies": 0.46875, "rewards/chosen": -0.03201708570122719, "rewards/margins": 0.0023030471056699753, "rewards/rejected": -0.03432013466954231, "step": 32 }, { "epoch": 0.07054108216432865, "grad_norm": 4.418696566904475, "learning_rate": 7.021276595744681e-07, "logits/chosen": -0.8691257834434509, "logits/rejected": -0.891472339630127, "logps/chosen": -229.89974975585938, "logps/rejected": -220.62893676757812, "loss": 0.6925, "rewards/accuracies": 0.46875, "rewards/chosen": -0.023654459044337273, "rewards/margins": 0.0017885996494442225, "rewards/rejected": -0.025443056598305702, "step": 33 }, { "epoch": 0.07267869071476286, "grad_norm": 5.281949481266581, "learning_rate": 7.23404255319149e-07, "logits/chosen": -0.7926970720291138, "logits/rejected": -0.7971447706222534, "logps/chosen": -201.50173950195312, "logps/rejected": -209.24432373046875, "loss": 0.6883, "rewards/accuracies": 0.625, "rewards/chosen": -0.0300833098590374, "rewards/margins": 0.011433225125074387, "rewards/rejected": -0.041516534984111786, "step": 34 }, { "epoch": 0.07481629926519706, "grad_norm": 5.310361096502114, "learning_rate": 7.446808510638297e-07, "logits/chosen": -0.910358726978302, "logits/rejected": -0.8681845664978027, "logps/chosen": -293.49481201171875, "logps/rejected": -264.9764709472656, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": -0.042106445878744125, "rewards/margins": -0.00045869359746575356, "rewards/rejected": -0.04164774715900421, "step": 35 }, { "epoch": 0.07695390781563126, "grad_norm": 4.880148293966411, "learning_rate": 7.659574468085106e-07, "logits/chosen": -0.9195268154144287, "logits/rejected": -0.9358838796615601, "logps/chosen": -219.29908752441406, "logps/rejected": -223.91160583496094, "loss": 0.6905, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03970751538872719, "rewards/margins": 0.009188718162477016, "rewards/rejected": -0.04889623448252678, "step": 36 }, { "epoch": 0.07909151636606547, "grad_norm": 4.918837324305735, "learning_rate": 7.872340425531915e-07, "logits/chosen": -0.7983888387680054, "logits/rejected": -0.7829576134681702, "logps/chosen": -236.22479248046875, "logps/rejected": -230.52279663085938, "loss": 0.6924, "rewards/accuracies": 0.625, "rewards/chosen": -0.03082606941461563, "rewards/margins": 0.007684895768761635, "rewards/rejected": -0.038510967046022415, "step": 37 }, { "epoch": 0.08122912491649967, "grad_norm": 4.697759235789417, "learning_rate": 8.085106382978723e-07, "logits/chosen": -0.9536780118942261, "logits/rejected": -0.9445628523826599, "logps/chosen": -239.7415771484375, "logps/rejected": -250.46978759765625, "loss": 0.6915, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04406914860010147, "rewards/margins": 0.007817601785063744, "rewards/rejected": -0.05188675597310066, "step": 38 }, { "epoch": 0.08336673346693386, "grad_norm": 4.942849749477713, "learning_rate": 8.297872340425532e-07, "logits/chosen": -0.8406745195388794, "logits/rejected": -0.8202511668205261, "logps/chosen": -283.8332824707031, "logps/rejected": -289.7784729003906, "loss": 0.6883, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0502498485147953, "rewards/margins": 0.014361884444952011, "rewards/rejected": -0.06461173295974731, "step": 39 }, { "epoch": 0.08550434201736808, "grad_norm": 5.117709083830907, "learning_rate": 8.51063829787234e-07, "logits/chosen": -0.8214735984802246, "logits/rejected": -0.811273992061615, "logps/chosen": -210.29600524902344, "logps/rejected": -199.48020935058594, "loss": 0.6884, "rewards/accuracies": 0.53125, "rewards/chosen": -0.04497796297073364, "rewards/margins": 0.01133386418223381, "rewards/rejected": -0.05631183087825775, "step": 40 }, { "epoch": 0.08764195056780227, "grad_norm": 5.136196664411302, "learning_rate": 8.723404255319149e-07, "logits/chosen": -0.969085693359375, "logits/rejected": -0.9578003287315369, "logps/chosen": -252.95278930664062, "logps/rejected": -256.9606018066406, "loss": 0.6848, "rewards/accuracies": 0.4375, "rewards/chosen": -0.07370009273290634, "rewards/margins": 0.004004061222076416, "rewards/rejected": -0.07770414650440216, "step": 41 }, { "epoch": 0.08977955911823647, "grad_norm": 4.838693140519435, "learning_rate": 8.936170212765957e-07, "logits/chosen": -0.8661520481109619, "logits/rejected": -0.8457835912704468, "logps/chosen": -304.5137634277344, "logps/rejected": -289.595947265625, "loss": 0.6883, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07082202285528183, "rewards/margins": 0.015582293272018433, "rewards/rejected": -0.08640430867671967, "step": 42 }, { "epoch": 0.09191716766867067, "grad_norm": 4.957200914658608, "learning_rate": 9.148936170212766e-07, "logits/chosen": -0.8786011338233948, "logits/rejected": -0.8692121505737305, "logps/chosen": -241.05532836914062, "logps/rejected": -243.45684814453125, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": -0.07279819995164871, "rewards/margins": 0.020279541611671448, "rewards/rejected": -0.09307773411273956, "step": 43 }, { "epoch": 0.09405477621910488, "grad_norm": 5.332532100522966, "learning_rate": 9.361702127659575e-07, "logits/chosen": -0.714208722114563, "logits/rejected": -0.7126749157905579, "logps/chosen": -319.6092834472656, "logps/rejected": -301.8595886230469, "loss": 0.6873, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07854731380939484, "rewards/margins": 0.005709480959922075, "rewards/rejected": -0.08425679802894592, "step": 44 }, { "epoch": 0.09619238476953908, "grad_norm": 5.165598994277126, "learning_rate": 9.574468085106384e-07, "logits/chosen": -0.8318406343460083, "logits/rejected": -0.849963903427124, "logps/chosen": -255.63446044921875, "logps/rejected": -259.7432556152344, "loss": 0.6883, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09484049677848816, "rewards/margins": 0.010637722909450531, "rewards/rejected": -0.10547823458909988, "step": 45 }, { "epoch": 0.09832999331997327, "grad_norm": 4.871720241221463, "learning_rate": 9.78723404255319e-07, "logits/chosen": -0.8702428936958313, "logits/rejected": -0.8339990377426147, "logps/chosen": -316.18670654296875, "logps/rejected": -329.9319152832031, "loss": 0.6892, "rewards/accuracies": 0.625, "rewards/chosen": -0.14160332083702087, "rewards/margins": 0.013123790733516216, "rewards/rejected": -0.15472710132598877, "step": 46 }, { "epoch": 0.10046760187040749, "grad_norm": 5.158837089218199, "learning_rate": 1e-06, "logits/chosen": -0.8626521229743958, "logits/rejected": -0.8603638410568237, "logps/chosen": -247.8237762451172, "logps/rejected": -249.759033203125, "loss": 0.6913, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09033364802598953, "rewards/margins": 0.0012807990424335003, "rewards/rejected": -0.09161444753408432, "step": 47 }, { "epoch": 0.10260521042084168, "grad_norm": 5.2967028714823785, "learning_rate": 9.999860125306348e-07, "logits/chosen": -0.8659788370132446, "logits/rejected": -0.8618423342704773, "logps/chosen": -272.1561279296875, "logps/rejected": -280.98040771484375, "loss": 0.6893, "rewards/accuracies": 0.5625, "rewards/chosen": -0.11396947503089905, "rewards/margins": -0.0073202308267354965, "rewards/rejected": -0.106649249792099, "step": 48 }, { "epoch": 0.10474281897127588, "grad_norm": 5.51515197326478, "learning_rate": 9.999440509051367e-07, "logits/chosen": -0.7946774363517761, "logits/rejected": -0.8100728988647461, "logps/chosen": -302.84283447265625, "logps/rejected": -298.60955810546875, "loss": 0.6849, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1010870561003685, "rewards/margins": 0.017633313313126564, "rewards/rejected": -0.11872036755084991, "step": 49 }, { "epoch": 0.10688042752171009, "grad_norm": 5.870386943751237, "learning_rate": 9.998741174712533e-07, "logits/chosen": -0.90606290102005, "logits/rejected": -0.9065860509872437, "logps/chosen": -257.7372741699219, "logps/rejected": -241.87298583984375, "loss": 0.6821, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12090699374675751, "rewards/margins": 0.024912692606449127, "rewards/rejected": -0.14581969380378723, "step": 50 }, { "epoch": 0.10901803607214429, "grad_norm": 5.544085731964276, "learning_rate": 9.997762161417517e-07, "logits/chosen": -0.8597516417503357, "logits/rejected": -0.8242354393005371, "logps/chosen": -244.0271759033203, "logps/rejected": -262.000732421875, "loss": 0.6771, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11861881613731384, "rewards/margins": 0.04306299239397049, "rewards/rejected": -0.16168181598186493, "step": 51 }, { "epoch": 0.11115564462257849, "grad_norm": 5.08779280072292, "learning_rate": 9.996503523941992e-07, "logits/chosen": -0.8984640836715698, "logits/rejected": -0.8927853107452393, "logps/chosen": -292.3353576660156, "logps/rejected": -283.715576171875, "loss": 0.6878, "rewards/accuracies": 0.375, "rewards/chosen": -0.1378980576992035, "rewards/margins": -0.0063457973301410675, "rewards/rejected": -0.13155226409435272, "step": 52 }, { "epoch": 0.1132932531730127, "grad_norm": 5.528861132333211, "learning_rate": 9.994965332706572e-07, "logits/chosen": -0.7924266457557678, "logits/rejected": -0.7879197597503662, "logps/chosen": -299.14617919921875, "logps/rejected": -305.268798828125, "loss": 0.6822, "rewards/accuracies": 0.65625, "rewards/chosen": -0.14223326742649078, "rewards/margins": 0.029797088354825974, "rewards/rejected": -0.17203034460544586, "step": 53 }, { "epoch": 0.1154308617234469, "grad_norm": 5.45602380596692, "learning_rate": 9.99314767377287e-07, "logits/chosen": -0.9068719744682312, "logits/rejected": -0.8776203393936157, "logps/chosen": -288.920166015625, "logps/rejected": -288.0073547363281, "loss": 0.6782, "rewards/accuracies": 0.625, "rewards/chosen": -0.12053351104259491, "rewards/margins": 0.04062645137310028, "rewards/rejected": -0.161159947514534, "step": 54 }, { "epoch": 0.11756847027388109, "grad_norm": 5.381814409133637, "learning_rate": 9.991050648838675e-07, "logits/chosen": -0.8684936165809631, "logits/rejected": -0.8693514466285706, "logps/chosen": -300.5417175292969, "logps/rejected": -297.93609619140625, "loss": 0.6852, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08863644301891327, "rewards/margins": 0.03236062452197075, "rewards/rejected": -0.12099706381559372, "step": 55 }, { "epoch": 0.1197060788243153, "grad_norm": 4.981246770478922, "learning_rate": 9.98867437523228e-07, "logits/chosen": -0.7902661561965942, "logits/rejected": -0.7963244915008545, "logps/chosen": -302.9090576171875, "logps/rejected": -296.0736389160156, "loss": 0.6823, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10501404106616974, "rewards/margins": 0.045005738735198975, "rewards/rejected": -0.15001976490020752, "step": 56 }, { "epoch": 0.1218436873747495, "grad_norm": 5.95244558017509, "learning_rate": 9.986018985905899e-07, "logits/chosen": -0.933331310749054, "logits/rejected": -0.9271438121795654, "logps/chosen": -257.21197509765625, "logps/rejected": -258.4394226074219, "loss": 0.6847, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1376655399799347, "rewards/margins": 0.020258434116840363, "rewards/rejected": -0.15792396664619446, "step": 57 }, { "epoch": 0.1239812959251837, "grad_norm": 5.625394184294828, "learning_rate": 9.983084629428244e-07, "logits/chosen": -0.790676474571228, "logits/rejected": -0.7989400625228882, "logps/chosen": -216.31825256347656, "logps/rejected": -239.0472869873047, "loss": 0.6822, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12253975123167038, "rewards/margins": 0.032085709273815155, "rewards/rejected": -0.15462547540664673, "step": 58 }, { "epoch": 0.1261189044756179, "grad_norm": 5.3018065912112835, "learning_rate": 9.979871469976195e-07, "logits/chosen": -0.7393543720245361, "logits/rejected": -0.7129000425338745, "logps/chosen": -311.56878662109375, "logps/rejected": -291.1382751464844, "loss": 0.6848, "rewards/accuracies": 0.4375, "rewards/chosen": -0.24215912818908691, "rewards/margins": -0.015711378306150436, "rewards/rejected": -0.22644776105880737, "step": 59 }, { "epoch": 0.1282565130260521, "grad_norm": 5.89246728884038, "learning_rate": 9.97637968732563e-07, "logits/chosen": -0.8696700930595398, "logits/rejected": -0.8711199760437012, "logps/chosen": -246.91502380371094, "logps/rejected": -262.1573791503906, "loss": 0.6851, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1573953628540039, "rewards/margins": 0.03878934681415558, "rewards/rejected": -0.1961846947669983, "step": 60 }, { "epoch": 0.1303941215764863, "grad_norm": 5.3302706046399555, "learning_rate": 9.972609476841365e-07, "logits/chosen": -0.915327787399292, "logits/rejected": -0.8959137201309204, "logps/chosen": -273.5627136230469, "logps/rejected": -297.04962158203125, "loss": 0.6851, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21575269103050232, "rewards/margins": 0.07058089226484299, "rewards/rejected": -0.2863335907459259, "step": 61 }, { "epoch": 0.13253173012692052, "grad_norm": 5.298125253303342, "learning_rate": 9.968561049466213e-07, "logits/chosen": -0.8035833239555359, "logits/rejected": -0.8177482485771179, "logps/chosen": -258.7190246582031, "logps/rejected": -260.00408935546875, "loss": 0.6761, "rewards/accuracies": 0.625, "rewards/chosen": -0.1654270738363266, "rewards/margins": 0.03356565535068512, "rewards/rejected": -0.19899272918701172, "step": 62 }, { "epoch": 0.1346693386773547, "grad_norm": 5.644014199691322, "learning_rate": 9.964234631709185e-07, "logits/chosen": -0.8946092128753662, "logits/rejected": -0.8983243703842163, "logps/chosen": -272.2535095214844, "logps/rejected": -278.0460205078125, "loss": 0.6812, "rewards/accuracies": 0.5625, "rewards/chosen": -0.18145883083343506, "rewards/margins": 0.05900725722312927, "rewards/rejected": -0.24046610295772552, "step": 63 }, { "epoch": 0.1368069472277889, "grad_norm": 6.088482546530936, "learning_rate": 9.959630465632831e-07, "logits/chosen": -0.8606098890304565, "logits/rejected": -0.8623652458190918, "logps/chosen": -256.6067199707031, "logps/rejected": -273.53668212890625, "loss": 0.6753, "rewards/accuracies": 0.59375, "rewards/chosen": -0.17427769303321838, "rewards/margins": 0.05003924295306206, "rewards/rejected": -0.22431692481040955, "step": 64 }, { "epoch": 0.13894455577822312, "grad_norm": 5.611060962151761, "learning_rate": 9.954748808839674e-07, "logits/chosen": -0.8806792497634888, "logits/rejected": -0.8958165645599365, "logps/chosen": -275.52301025390625, "logps/rejected": -273.21563720703125, "loss": 0.6823, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2500859200954437, "rewards/margins": 0.017448339611291885, "rewards/rejected": -0.2675342261791229, "step": 65 }, { "epoch": 0.1410821643286573, "grad_norm": 5.918149017741809, "learning_rate": 9.949589934457814e-07, "logits/chosen": -0.8888027667999268, "logits/rejected": -0.871585488319397, "logps/chosen": -248.55703735351562, "logps/rejected": -258.9693603515625, "loss": 0.6824, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1963491439819336, "rewards/margins": 0.04409556835889816, "rewards/rejected": -0.24044471979141235, "step": 66 }, { "epoch": 0.14321977287909152, "grad_norm": 6.698179177771139, "learning_rate": 9.944154131125642e-07, "logits/chosen": -0.853302001953125, "logits/rejected": -0.848848819732666, "logps/chosen": -277.59442138671875, "logps/rejected": -297.14141845703125, "loss": 0.6639, "rewards/accuracies": 0.8125, "rewards/chosen": -0.24216346442699432, "rewards/margins": 0.10889790952205658, "rewards/rejected": -0.3510614037513733, "step": 67 }, { "epoch": 0.14535738142952573, "grad_norm": 5.596769283806181, "learning_rate": 9.938441702975689e-07, "logits/chosen": -0.7764022350311279, "logits/rejected": -0.7560886144638062, "logps/chosen": -250.94287109375, "logps/rejected": -250.5952606201172, "loss": 0.6732, "rewards/accuracies": 0.625, "rewards/chosen": -0.2156543880701065, "rewards/margins": 0.03958458825945854, "rewards/rejected": -0.25523898005485535, "step": 68 }, { "epoch": 0.1474949899799599, "grad_norm": 5.913968144404886, "learning_rate": 9.932452969617607e-07, "logits/chosen": -0.7237470746040344, "logits/rejected": -0.7399138808250427, "logps/chosen": -244.21449279785156, "logps/rejected": -254.1151123046875, "loss": 0.6695, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1943284124135971, "rewards/margins": 0.060313306748867035, "rewards/rejected": -0.25464171171188354, "step": 69 }, { "epoch": 0.14963259853039412, "grad_norm": 5.940310444508497, "learning_rate": 9.926188266120295e-07, "logits/chosen": -0.8615679144859314, "logits/rejected": -0.8436312675476074, "logps/chosen": -256.257080078125, "logps/rejected": -262.7105712890625, "loss": 0.679, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17094658315181732, "rewards/margins": 0.055061645805835724, "rewards/rejected": -0.22600823640823364, "step": 70 }, { "epoch": 0.15177020708082833, "grad_norm": 5.928886998788439, "learning_rate": 9.919647942993147e-07, "logits/chosen": -0.8513661623001099, "logits/rejected": -0.8609136343002319, "logps/chosen": -299.2288818359375, "logps/rejected": -326.5621032714844, "loss": 0.6711, "rewards/accuracies": 0.625, "rewards/chosen": -0.22696439921855927, "rewards/margins": 0.04341430217027664, "rewards/rejected": -0.2703787088394165, "step": 71 }, { "epoch": 0.15390781563126252, "grad_norm": 5.791936546761731, "learning_rate": 9.912832366166441e-07, "logits/chosen": -0.756388783454895, "logits/rejected": -0.734666109085083, "logps/chosen": -299.2653503417969, "logps/rejected": -307.1020202636719, "loss": 0.6727, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3383556604385376, "rewards/margins": 0.01706152781844139, "rewards/rejected": -0.3554171919822693, "step": 72 }, { "epoch": 0.15604542418169673, "grad_norm": 6.05455714563325, "learning_rate": 9.905741916970863e-07, "logits/chosen": -0.9010551571846008, "logits/rejected": -0.8836992383003235, "logps/chosen": -339.32806396484375, "logps/rejected": -335.24285888671875, "loss": 0.6703, "rewards/accuracies": 0.65625, "rewards/chosen": -0.40806159377098083, "rewards/margins": -0.019612746313214302, "rewards/rejected": -0.38844889402389526, "step": 73 }, { "epoch": 0.15818303273213094, "grad_norm": 6.2106979275919025, "learning_rate": 9.898376992116177e-07, "logits/chosen": -0.9612334370613098, "logits/rejected": -0.9398088455200195, "logps/chosen": -282.431640625, "logps/rejected": -281.66558837890625, "loss": 0.6763, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3227473497390747, "rewards/margins": 0.047923244535923004, "rewards/rejected": -0.3706705868244171, "step": 74 }, { "epoch": 0.16032064128256512, "grad_norm": 5.916909013866816, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.8319401741027832, "logits/rejected": -0.815265953540802, "logps/chosen": -281.00518798828125, "logps/rejected": -273.9776306152344, "loss": 0.6605, "rewards/accuracies": 0.75, "rewards/chosen": -0.3827057480812073, "rewards/margins": 0.0613156333565712, "rewards/rejected": -0.4440213441848755, "step": 75 }, { "epoch": 0.16245824983299934, "grad_norm": 6.67755131316461, "learning_rate": 9.882825379029882e-07, "logits/chosen": -0.8953054547309875, "logits/rejected": -0.894780695438385, "logps/chosen": -312.055908203125, "logps/rejected": -330.704833984375, "loss": 0.6602, "rewards/accuracies": 0.5625, "rewards/chosen": -0.45433858036994934, "rewards/margins": 0.07449479401111603, "rewards/rejected": -0.5288333892822266, "step": 76 }, { "epoch": 0.16459585838343355, "grad_norm": 6.2117918809225765, "learning_rate": 9.874639560909118e-07, "logits/chosen": -0.9046330451965332, "logits/rejected": -0.898413360118866, "logps/chosen": -294.0129089355469, "logps/rejected": -299.68560791015625, "loss": 0.6753, "rewards/accuracies": 0.625, "rewards/chosen": -0.4101888835430145, "rewards/margins": 0.12080243229866028, "rewards/rejected": -0.5309913158416748, "step": 77 }, { "epoch": 0.16673346693386773, "grad_norm": 5.632634022928361, "learning_rate": 9.866181007302256e-07, "logits/chosen": -0.6335713267326355, "logits/rejected": -0.6313363313674927, "logps/chosen": -281.41400146484375, "logps/rejected": -291.63800048828125, "loss": 0.6659, "rewards/accuracies": 0.71875, "rewards/chosen": -0.36711931228637695, "rewards/margins": 0.1256605088710785, "rewards/rejected": -0.49277979135513306, "step": 78 }, { "epoch": 0.16887107548430194, "grad_norm": 6.069106827042514, "learning_rate": 9.857450191464337e-07, "logits/chosen": -0.7797252535820007, "logits/rejected": -0.7820223569869995, "logps/chosen": -256.88421630859375, "logps/rejected": -279.4145812988281, "loss": 0.653, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3781868815422058, "rewards/margins": 0.07347656786441803, "rewards/rejected": -0.45166343450546265, "step": 79 }, { "epoch": 0.17100868403473615, "grad_norm": 6.074173522598461, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.8622775673866272, "logits/rejected": -0.8331011533737183, "logps/chosen": -309.4617614746094, "logps/rejected": -330.5566101074219, "loss": 0.6552, "rewards/accuracies": 0.625, "rewards/chosen": -0.49035871028900146, "rewards/margins": 0.17824454605579376, "rewards/rejected": -0.6686033010482788, "step": 80 }, { "epoch": 0.17314629258517034, "grad_norm": 6.5133260441754395, "learning_rate": 9.839173742253334e-07, "logits/chosen": -0.7489383816719055, "logits/rejected": -0.781232476234436, "logps/chosen": -296.9482116699219, "logps/rejected": -327.5967712402344, "loss": 0.6688, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5791828036308289, "rewards/margins": 0.188523530960083, "rewards/rejected": -0.7677063345909119, "step": 81 }, { "epoch": 0.17528390113560455, "grad_norm": 5.74672853077721, "learning_rate": 9.82962913144534e-07, "logits/chosen": -0.8432500958442688, "logits/rejected": -0.8211543560028076, "logps/chosen": -293.7790222167969, "logps/rejected": -304.9800720214844, "loss": 0.6522, "rewards/accuracies": 0.5625, "rewards/chosen": -0.47243934869766235, "rewards/margins": 0.12751685082912445, "rewards/rejected": -0.599956214427948, "step": 82 }, { "epoch": 0.17742150968603873, "grad_norm": 6.3990299421699675, "learning_rate": 9.819814303479267e-07, "logits/chosen": -0.9426258206367493, "logits/rejected": -0.9214622378349304, "logps/chosen": -290.99407958984375, "logps/rejected": -301.18212890625, "loss": 0.652, "rewards/accuracies": 0.71875, "rewards/chosen": -0.49375712871551514, "rewards/margins": 0.1566104143857956, "rewards/rejected": -0.6503674983978271, "step": 83 }, { "epoch": 0.17955911823647294, "grad_norm": 6.534280177132367, "learning_rate": 9.80972980749353e-07, "logits/chosen": -0.8522071838378906, "logits/rejected": -0.8386092185974121, "logps/chosen": -345.668212890625, "logps/rejected": -346.40960693359375, "loss": 0.67, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6266711950302124, "rewards/margins": 0.09361431002616882, "rewards/rejected": -0.7202855348587036, "step": 84 }, { "epoch": 0.18169672678690715, "grad_norm": 6.649073031684906, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.7474217414855957, "logits/rejected": -0.7404229044914246, "logps/chosen": -275.940673828125, "logps/rejected": -290.2484130859375, "loss": 0.6365, "rewards/accuracies": 0.6875, "rewards/chosen": -0.41143321990966797, "rewards/margins": 0.08391736447811127, "rewards/rejected": -0.49535059928894043, "step": 85 }, { "epoch": 0.18383433533734134, "grad_norm": 6.963291541132159, "learning_rate": 9.788754083424652e-07, "logits/chosen": -0.824079692363739, "logits/rejected": -0.8041766285896301, "logps/chosen": -321.2813720703125, "logps/rejected": -339.7249450683594, "loss": 0.6636, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5758030414581299, "rewards/margins": 0.19611942768096924, "rewards/rejected": -0.7719224095344543, "step": 86 }, { "epoch": 0.18597194388777555, "grad_norm": 6.945463717696004, "learning_rate": 9.777864028930705e-07, "logits/chosen": -0.7686063647270203, "logits/rejected": -0.7663296461105347, "logps/chosen": -349.73004150390625, "logps/rejected": -375.2843017578125, "loss": 0.626, "rewards/accuracies": 0.625, "rewards/chosen": -0.6469497680664062, "rewards/margins": 0.2732096314430237, "rewards/rejected": -0.9201593399047852, "step": 87 }, { "epoch": 0.18810955243820976, "grad_norm": 6.714366991925423, "learning_rate": 9.766706653529812e-07, "logits/chosen": -0.782423734664917, "logits/rejected": -0.7881312966346741, "logps/chosen": -301.2457275390625, "logps/rejected": -310.0863037109375, "loss": 0.6652, "rewards/accuracies": 0.5, "rewards/chosen": -0.5998435616493225, "rewards/margins": 0.09575268626213074, "rewards/rejected": -0.6955962777137756, "step": 88 }, { "epoch": 0.19024716098864394, "grad_norm": 7.241214530195881, "learning_rate": 9.755282581475767e-07, "logits/chosen": -0.8655251860618591, "logits/rejected": -0.8472452163696289, "logps/chosen": -398.3143310546875, "logps/rejected": -434.9195556640625, "loss": 0.6159, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8902552723884583, "rewards/margins": 0.29991066455841064, "rewards/rejected": -1.1901659965515137, "step": 89 }, { "epoch": 0.19238476953907815, "grad_norm": 7.90505927903396, "learning_rate": 9.743592451943998e-07, "logits/chosen": -0.8578193783760071, "logits/rejected": -0.8561904430389404, "logps/chosen": -281.01824951171875, "logps/rejected": -304.5150146484375, "loss": 0.6939, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7082527875900269, "rewards/margins": 0.08812718093395233, "rewards/rejected": -0.7963800430297852, "step": 90 }, { "epoch": 0.19452237808951237, "grad_norm": 7.5921079251944, "learning_rate": 9.73163691899582e-07, "logits/chosen": -0.678159236907959, "logits/rejected": -0.6668828725814819, "logps/chosen": -300.15338134765625, "logps/rejected": -306.63525390625, "loss": 0.6812, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6471429467201233, "rewards/margins": 0.08188958466053009, "rewards/rejected": -0.729032576084137, "step": 91 }, { "epoch": 0.19665998663994655, "grad_norm": 7.137628936459269, "learning_rate": 9.719416651541837e-07, "logits/chosen": -0.8150886297225952, "logits/rejected": -0.8088028430938721, "logps/chosen": -431.6229248046875, "logps/rejected": -458.9399108886719, "loss": 0.643, "rewards/accuracies": 0.75, "rewards/chosen": -1.0566003322601318, "rewards/margins": 0.2619227468967438, "rewards/rejected": -1.3185230493545532, "step": 92 }, { "epoch": 0.19879759519038076, "grad_norm": 6.729473146383851, "learning_rate": 9.706932333304517e-07, "logits/chosen": -0.8243950605392456, "logits/rejected": -0.838744580745697, "logps/chosen": -312.406494140625, "logps/rejected": -335.4088134765625, "loss": 0.6498, "rewards/accuracies": 0.46875, "rewards/chosen": -0.7556511759757996, "rewards/margins": 0.031043091788887978, "rewards/rejected": -0.7866942882537842, "step": 93 }, { "epoch": 0.20093520374081497, "grad_norm": 6.624154045427617, "learning_rate": 9.694184662779929e-07, "logits/chosen": -0.783348560333252, "logits/rejected": -0.7991134524345398, "logps/chosen": -289.2900695800781, "logps/rejected": -290.5962829589844, "loss": 0.6525, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6890867352485657, "rewards/margins": 0.08654731512069702, "rewards/rejected": -0.7756341099739075, "step": 94 }, { "epoch": 0.20307281229124916, "grad_norm": 7.588312119029146, "learning_rate": 9.681174353198686e-07, "logits/chosen": -0.8928613066673279, "logits/rejected": -0.9167020916938782, "logps/chosen": -263.0621032714844, "logps/rejected": -291.3228759765625, "loss": 0.6785, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6083306670188904, "rewards/margins": 0.10731954127550125, "rewards/rejected": -0.715650200843811, "step": 95 }, { "epoch": 0.20521042084168337, "grad_norm": 7.93175048638323, "learning_rate": 9.667902132486008e-07, "logits/chosen": -0.7266509532928467, "logits/rejected": -0.7005448341369629, "logps/chosen": -355.4562072753906, "logps/rejected": -368.688232421875, "loss": 0.6808, "rewards/accuracies": 0.625, "rewards/chosen": -0.9357940554618835, "rewards/margins": 0.1980743259191513, "rewards/rejected": -1.1338684558868408, "step": 96 }, { "epoch": 0.20734802939211758, "grad_norm": 7.282370392547328, "learning_rate": 9.65436874322102e-07, "logits/chosen": -0.7565743327140808, "logits/rejected": -0.765534520149231, "logps/chosen": -360.4274597167969, "logps/rejected": -397.3307189941406, "loss": 0.6365, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9580825567245483, "rewards/margins": 0.27880433201789856, "rewards/rejected": -1.2368868589401245, "step": 97 }, { "epoch": 0.20948563794255176, "grad_norm": 7.307890632023091, "learning_rate": 9.640574942595194e-07, "logits/chosen": -0.6865275502204895, "logits/rejected": -0.6510294079780579, "logps/chosen": -299.5666198730469, "logps/rejected": -315.7306823730469, "loss": 0.637, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6532863974571228, "rewards/margins": 0.1549152433872223, "rewards/rejected": -0.8082016706466675, "step": 98 }, { "epoch": 0.21162324649298597, "grad_norm": 7.447581281931192, "learning_rate": 9.626521502369983e-07, "logits/chosen": -0.6352126598358154, "logits/rejected": -0.6191614866256714, "logps/chosen": -293.2029113769531, "logps/rejected": -306.13330078125, "loss": 0.6658, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7388824224472046, "rewards/margins": 0.15435971319675446, "rewards/rejected": -0.8932421803474426, "step": 99 }, { "epoch": 0.21376085504342018, "grad_norm": 6.648161187751906, "learning_rate": 9.612209208833646e-07, "logits/chosen": -0.7408478856086731, "logits/rejected": -0.7513828277587891, "logps/chosen": -301.5423583984375, "logps/rejected": -345.68682861328125, "loss": 0.628, "rewards/accuracies": 0.75, "rewards/chosen": -0.7927481532096863, "rewards/margins": 0.24078083038330078, "rewards/rejected": -1.0335289239883423, "step": 100 }, { "epoch": 0.21376085504342018, "eval_logits/chosen": -0.7527643442153931, "eval_logits/rejected": -0.7538674473762512, "eval_logps/chosen": -343.6059875488281, "eval_logps/rejected": -362.7133483886719, "eval_loss": 0.6641345024108887, "eval_rewards/accuracies": 0.6239837408065796, "eval_rewards/chosen": -0.8805798888206482, "eval_rewards/margins": 0.1340140700340271, "eval_rewards/rejected": -1.0145939588546753, "eval_runtime": 372.3126, "eval_samples_per_second": 5.267, "eval_steps_per_second": 0.33, "step": 100 }, { "epoch": 0.21589846359385437, "grad_norm": 7.778718674441958, "learning_rate": 9.597638862757253e-07, "logits/chosen": -0.8201433420181274, "logits/rejected": -0.8069182634353638, "logps/chosen": -256.0120849609375, "logps/rejected": -269.8443603515625, "loss": 0.6831, "rewards/accuracies": 0.5625, "rewards/chosen": -0.656363844871521, "rewards/margins": 0.05751778930425644, "rewards/rejected": -0.7138815522193909, "step": 101 }, { "epoch": 0.21803607214428858, "grad_norm": 7.5706021854045185, "learning_rate": 9.58281127934988e-07, "logits/chosen": -0.6860804557800293, "logits/rejected": -0.7110453844070435, "logps/chosen": -368.2939453125, "logps/rejected": -393.86029052734375, "loss": 0.6576, "rewards/accuracies": 0.6875, "rewards/chosen": -1.020776391029358, "rewards/margins": 0.1516759693622589, "rewards/rejected": -1.172452449798584, "step": 102 }, { "epoch": 0.2201736806947228, "grad_norm": 8.607842129213472, "learning_rate": 9.567727288213004e-07, "logits/chosen": -0.7699592113494873, "logits/rejected": -0.7589491605758667, "logps/chosen": -324.6326904296875, "logps/rejected": -358.59820556640625, "loss": 0.7094, "rewards/accuracies": 0.53125, "rewards/chosen": -1.0056777000427246, "rewards/margins": 0.18924392759799957, "rewards/rejected": -1.1949217319488525, "step": 103 }, { "epoch": 0.22231128924515697, "grad_norm": 7.291755560282041, "learning_rate": 9.552387733294078e-07, "logits/chosen": -0.6555180549621582, "logits/rejected": -0.6659807562828064, "logps/chosen": -330.6410827636719, "logps/rejected": -359.6870422363281, "loss": 0.6453, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8741835355758667, "rewards/margins": 0.2183988094329834, "rewards/rejected": -1.09258234500885, "step": 104 }, { "epoch": 0.22444889779559118, "grad_norm": 7.775554579983475, "learning_rate": 9.536793472839324e-07, "logits/chosen": -0.6701323986053467, "logits/rejected": -0.6589778661727905, "logps/chosen": -285.3506164550781, "logps/rejected": -288.3831481933594, "loss": 0.6488, "rewards/accuracies": 0.53125, "rewards/chosen": -0.7929357290267944, "rewards/margins": 0.09136777371168137, "rewards/rejected": -0.8843034505844116, "step": 105 }, { "epoch": 0.2265865063460254, "grad_norm": 7.266567693754681, "learning_rate": 9.520945379345699e-07, "logits/chosen": -0.8183209300041199, "logits/rejected": -0.8361554741859436, "logps/chosen": -397.4153747558594, "logps/rejected": -423.17333984375, "loss": 0.6383, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1003904342651367, "rewards/margins": 0.1266530454158783, "rewards/rejected": -1.2270435094833374, "step": 106 }, { "epoch": 0.22872411489645958, "grad_norm": 7.518282958403423, "learning_rate": 9.504844339512094e-07, "logits/chosen": -0.8879948854446411, "logits/rejected": -0.8571330904960632, "logps/chosen": -287.59051513671875, "logps/rejected": -297.351318359375, "loss": 0.6476, "rewards/accuracies": 0.625, "rewards/chosen": -0.7065630555152893, "rewards/margins": 0.15939508378505707, "rewards/rejected": -0.8659580945968628, "step": 107 }, { "epoch": 0.2308617234468938, "grad_norm": 7.824671981101, "learning_rate": 9.488491254189716e-07, "logits/chosen": -0.8066489696502686, "logits/rejected": -0.8055952191352844, "logps/chosen": -404.3518981933594, "logps/rejected": -442.6438903808594, "loss": 0.6404, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0566518306732178, "rewards/margins": 0.3284587264060974, "rewards/rejected": -1.3851103782653809, "step": 108 }, { "epoch": 0.232999331997328, "grad_norm": 8.83083617083813, "learning_rate": 9.471887038331684e-07, "logits/chosen": -0.7246598601341248, "logits/rejected": -0.7441533207893372, "logps/chosen": -354.1577453613281, "logps/rejected": -366.261962890625, "loss": 0.6873, "rewards/accuracies": 0.75, "rewards/chosen": -0.9086767435073853, "rewards/margins": 0.14375557005405426, "rewards/rejected": -1.0524324178695679, "step": 109 }, { "epoch": 0.23513694054776219, "grad_norm": 6.762910416252425, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.7163547277450562, "logits/rejected": -0.7031821608543396, "logps/chosen": -281.1316833496094, "logps/rejected": -283.372314453125, "loss": 0.6652, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6051456332206726, "rewards/margins": 0.09137356281280518, "rewards/rejected": -0.6965191960334778, "step": 110 }, { "epoch": 0.2372745490981964, "grad_norm": 7.354791673779593, "learning_rate": 9.43792894502277e-07, "logits/chosen": -0.6413677334785461, "logits/rejected": -0.6314007043838501, "logps/chosen": -341.87396240234375, "logps/rejected": -356.4854736328125, "loss": 0.6642, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8350028395652771, "rewards/margins": 0.21570800244808197, "rewards/rejected": -1.050710916519165, "step": 111 }, { "epoch": 0.2394121576486306, "grad_norm": 7.625646719699033, "learning_rate": 9.420576967523048e-07, "logits/chosen": -0.7540197968482971, "logits/rejected": -0.7288798093795776, "logps/chosen": -290.5899963378906, "logps/rejected": -294.30804443359375, "loss": 0.6563, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6742240786552429, "rewards/margins": 0.1976398229598999, "rewards/rejected": -0.8718639612197876, "step": 112 }, { "epoch": 0.2415497661990648, "grad_norm": 7.749312449639858, "learning_rate": 9.402977659283689e-07, "logits/chosen": -0.773981511592865, "logits/rejected": -0.7674249410629272, "logps/chosen": -323.57000732421875, "logps/rejected": -349.71990966796875, "loss": 0.6365, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7962589859962463, "rewards/margins": 0.18393486738204956, "rewards/rejected": -0.9801937937736511, "step": 113 }, { "epoch": 0.243687374749499, "grad_norm": 7.4503816098925055, "learning_rate": 9.385132004983832e-07, "logits/chosen": -0.7875250577926636, "logits/rejected": -0.7886217832565308, "logps/chosen": -289.820068359375, "logps/rejected": -307.18914794921875, "loss": 0.6351, "rewards/accuracies": 0.625, "rewards/chosen": -0.6762690544128418, "rewards/margins": 0.16321714222431183, "rewards/rejected": -0.8394861817359924, "step": 114 }, { "epoch": 0.2458249832999332, "grad_norm": 7.383473143295883, "learning_rate": 9.367041003085648e-07, "logits/chosen": -0.811254620552063, "logits/rejected": -0.8413273692131042, "logps/chosen": -328.42877197265625, "logps/rejected": -360.35430908203125, "loss": 0.6373, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6836004257202148, "rewards/margins": 0.1819692850112915, "rewards/rejected": -0.8655696511268616, "step": 115 }, { "epoch": 0.2479625918503674, "grad_norm": 6.933138308165148, "learning_rate": 9.348705665778477e-07, "logits/chosen": -0.7606134414672852, "logits/rejected": -0.7490028142929077, "logps/chosen": -342.7862548828125, "logps/rejected": -355.22943115234375, "loss": 0.6449, "rewards/accuracies": 0.53125, "rewards/chosen": -0.9342496991157532, "rewards/margins": 0.09783473610877991, "rewards/rejected": -1.0320844650268555, "step": 116 }, { "epoch": 0.2501002004008016, "grad_norm": 6.9991891498789744, "learning_rate": 9.330127018922193e-07, "logits/chosen": -0.7081186771392822, "logits/rejected": -0.7329989075660706, "logps/chosen": -361.0794372558594, "logps/rejected": -369.33721923828125, "loss": 0.6483, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8992618322372437, "rewards/margins": 0.12093706429004669, "rewards/rejected": -1.020198941230774, "step": 117 }, { "epoch": 0.2522378089512358, "grad_norm": 6.891968237145087, "learning_rate": 9.311306101989812e-07, "logits/chosen": -0.7707226872444153, "logits/rejected": -0.775468111038208, "logps/chosen": -328.4278869628906, "logps/rejected": -375.73712158203125, "loss": 0.6215, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7947558164596558, "rewards/margins": 0.2816506326198578, "rewards/rejected": -1.076406478881836, "step": 118 }, { "epoch": 0.25437541750167003, "grad_norm": 7.78238688784484, "learning_rate": 9.29224396800933e-07, "logits/chosen": -0.8061501383781433, "logits/rejected": -0.7823886275291443, "logps/chosen": -322.4601135253906, "logps/rejected": -329.06744384765625, "loss": 0.6551, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8091481924057007, "rewards/margins": -0.014971929602324963, "rewards/rejected": -0.7941762208938599, "step": 119 }, { "epoch": 0.2565130260521042, "grad_norm": 7.538013946361546, "learning_rate": 9.272941683504808e-07, "logits/chosen": -0.7215307950973511, "logits/rejected": -0.7078826427459717, "logps/chosen": -356.06005859375, "logps/rejected": -357.49774169921875, "loss": 0.6384, "rewards/accuracies": 0.625, "rewards/chosen": -0.8885184526443481, "rewards/margins": 0.18230639398097992, "rewards/rejected": -1.0708248615264893, "step": 120 }, { "epoch": 0.2586506346025384, "grad_norm": 7.545420813102953, "learning_rate": 9.253400328436698e-07, "logits/chosen": -0.7346601486206055, "logits/rejected": -0.7335522174835205, "logps/chosen": -344.805419921875, "logps/rejected": -350.87725830078125, "loss": 0.6594, "rewards/accuracies": 0.59375, "rewards/chosen": -0.8253999948501587, "rewards/margins": 0.0806780755519867, "rewards/rejected": -0.9060779809951782, "step": 121 }, { "epoch": 0.2607882431529726, "grad_norm": 7.7074461457899535, "learning_rate": 9.233620996141421e-07, "logits/chosen": -0.8815721273422241, "logits/rejected": -0.8621220588684082, "logps/chosen": -336.6763610839844, "logps/rejected": -341.74798583984375, "loss": 0.6341, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7817858457565308, "rewards/margins": 0.07886642962694168, "rewards/rejected": -0.8606522083282471, "step": 122 }, { "epoch": 0.2629258517034068, "grad_norm": 7.761484323525846, "learning_rate": 9.213604793270196e-07, "logits/chosen": -0.8222033977508545, "logits/rejected": -0.8148404955863953, "logps/chosen": -303.2247009277344, "logps/rejected": -315.91888427734375, "loss": 0.6419, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7392472624778748, "rewards/margins": 0.13012456893920898, "rewards/rejected": -0.869371771812439, "step": 123 }, { "epoch": 0.26506346025384103, "grad_norm": 8.151352928349633, "learning_rate": 9.19335283972712e-07, "logits/chosen": -0.7656688690185547, "logits/rejected": -0.7709140181541443, "logps/chosen": -374.8747253417969, "logps/rejected": -376.098876953125, "loss": 0.6754, "rewards/accuracies": 0.5625, "rewards/chosen": -1.005488634109497, "rewards/margins": 0.06390834599733353, "rewards/rejected": -1.06939697265625, "step": 124 }, { "epoch": 0.26720106880427524, "grad_norm": 7.63262703375028, "learning_rate": 9.172866268606513e-07, "logits/chosen": -0.755223274230957, "logits/rejected": -0.7677374482154846, "logps/chosen": -372.7818603515625, "logps/rejected": -385.9354248046875, "loss": 0.6662, "rewards/accuracies": 0.75, "rewards/chosen": -0.8667163848876953, "rewards/margins": 0.20413543283939362, "rewards/rejected": -1.0708518028259277, "step": 125 }, { "epoch": 0.2693386773547094, "grad_norm": 7.478834701874977, "learning_rate": 9.152146226129518e-07, "logits/chosen": -0.7996259927749634, "logits/rejected": -0.7835624814033508, "logps/chosen": -292.76129150390625, "logps/rejected": -333.20477294921875, "loss": 0.6172, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7281415462493896, "rewards/margins": 0.3209742605686188, "rewards/rejected": -1.049115777015686, "step": 126 }, { "epoch": 0.2714762859051436, "grad_norm": 7.082169803011623, "learning_rate": 9.131193871579974e-07, "logits/chosen": -0.8138784766197205, "logits/rejected": -0.829187273979187, "logps/chosen": -353.7518615722656, "logps/rejected": -404.1153564453125, "loss": 0.6436, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8756864070892334, "rewards/margins": 0.2420441061258316, "rewards/rejected": -1.1177304983139038, "step": 127 }, { "epoch": 0.2736138944555778, "grad_norm": 7.237862112577957, "learning_rate": 9.11001037723955e-07, "logits/chosen": -0.7936111688613892, "logits/rejected": -0.8008431196212769, "logps/chosen": -332.17718505859375, "logps/rejected": -352.5616760253906, "loss": 0.6689, "rewards/accuracies": 0.625, "rewards/chosen": -0.7826520800590515, "rewards/margins": 0.20596377551555634, "rewards/rejected": -0.988615870475769, "step": 128 }, { "epoch": 0.27575150300601203, "grad_norm": 8.604847241866956, "learning_rate": 9.088596928322157e-07, "logits/chosen": -0.8067824840545654, "logits/rejected": -0.8039845824241638, "logps/chosen": -333.2156982421875, "logps/rejected": -357.6597595214844, "loss": 0.6587, "rewards/accuracies": 0.59375, "rewards/chosen": -0.787762463092804, "rewards/margins": 0.017246991395950317, "rewards/rejected": -0.8050093650817871, "step": 129 }, { "epoch": 0.27788911155644624, "grad_norm": 8.324207089057085, "learning_rate": 9.066954722907638e-07, "logits/chosen": -0.6775297522544861, "logits/rejected": -0.7070217132568359, "logps/chosen": -324.43402099609375, "logps/rejected": -350.53851318359375, "loss": 0.645, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7566977739334106, "rewards/margins": 0.24282482266426086, "rewards/rejected": -0.9995224475860596, "step": 130 }, { "epoch": 0.2800267201068804, "grad_norm": 7.364170729863742, "learning_rate": 9.045084971874737e-07, "logits/chosen": -0.7260534167289734, "logits/rejected": -0.7187973260879517, "logps/chosen": -294.4010925292969, "logps/rejected": -310.21136474609375, "loss": 0.6398, "rewards/accuracies": 0.53125, "rewards/chosen": -0.7721171379089355, "rewards/margins": 0.15222765505313873, "rewards/rejected": -0.9243447184562683, "step": 131 }, { "epoch": 0.2821643286573146, "grad_norm": 7.35116325693309, "learning_rate": 9.022988898833342e-07, "logits/chosen": -0.7463628053665161, "logits/rejected": -0.7459514141082764, "logps/chosen": -329.623779296875, "logps/rejected": -356.4615783691406, "loss": 0.5991, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8810457587242126, "rewards/margins": 0.1869642734527588, "rewards/rejected": -1.0680099725723267, "step": 132 }, { "epoch": 0.2843019372077488, "grad_norm": 7.43517337943034, "learning_rate": 9.000667740056032e-07, "logits/chosen": -0.7253285646438599, "logits/rejected": -0.7020008563995361, "logps/chosen": -341.2428894042969, "logps/rejected": -399.8907470703125, "loss": 0.6251, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9917829036712646, "rewards/margins": 0.3243829011917114, "rewards/rejected": -1.3161659240722656, "step": 133 }, { "epoch": 0.28643954575818303, "grad_norm": 8.02042016596172, "learning_rate": 8.978122744408905e-07, "logits/chosen": -0.6935924887657166, "logits/rejected": -0.6478650569915771, "logps/chosen": -383.7857971191406, "logps/rejected": -403.4067077636719, "loss": 0.6472, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0208508968353271, "rewards/margins": 0.2477567493915558, "rewards/rejected": -1.2686076164245605, "step": 134 }, { "epoch": 0.28857715430861725, "grad_norm": 7.085674453391065, "learning_rate": 8.955355173281707e-07, "logits/chosen": -0.7271559238433838, "logits/rejected": -0.7309106588363647, "logps/chosen": -310.3777160644531, "logps/rejected": -329.25323486328125, "loss": 0.6007, "rewards/accuracies": 0.59375, "rewards/chosen": -0.7959171533584595, "rewards/margins": 0.21840126812458038, "rewards/rejected": -1.0143184661865234, "step": 135 }, { "epoch": 0.29071476285905146, "grad_norm": 7.837723075107936, "learning_rate": 8.932366300517249e-07, "logits/chosen": -0.771674633026123, "logits/rejected": -0.7675716280937195, "logps/chosen": -381.0829772949219, "logps/rejected": -408.50616455078125, "loss": 0.6332, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0345312356948853, "rewards/margins": 0.18286427855491638, "rewards/rejected": -1.217395544052124, "step": 136 }, { "epoch": 0.2928523714094856, "grad_norm": 9.181399284387098, "learning_rate": 8.909157412340149e-07, "logits/chosen": -0.837311863899231, "logits/rejected": -0.8280692100524902, "logps/chosen": -368.6721496582031, "logps/rejected": -397.39056396484375, "loss": 0.672, "rewards/accuracies": 0.53125, "rewards/chosen": -1.0871038436889648, "rewards/margins": 0.1502176821231842, "rewards/rejected": -1.2373214960098267, "step": 137 }, { "epoch": 0.2949899799599198, "grad_norm": 8.547486747659995, "learning_rate": 8.885729807284854e-07, "logits/chosen": -0.6511350274085999, "logits/rejected": -0.6316956877708435, "logps/chosen": -367.9530029296875, "logps/rejected": -376.24652099609375, "loss": 0.6666, "rewards/accuracies": 0.625, "rewards/chosen": -1.1004064083099365, "rewards/margins": 0.1894245594739914, "rewards/rejected": -1.2898309230804443, "step": 138 }, { "epoch": 0.29712758851035403, "grad_norm": 7.209713050210504, "learning_rate": 8.862084796122997e-07, "logits/chosen": -0.7271043658256531, "logits/rejected": -0.7313827276229858, "logps/chosen": -305.42919921875, "logps/rejected": -366.8320007324219, "loss": 0.6285, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8596370816230774, "rewards/margins": 0.34605735540390015, "rewards/rejected": -1.205694556236267, "step": 139 }, { "epoch": 0.29926519706078825, "grad_norm": 8.777840373189521, "learning_rate": 8.838223701790055e-07, "logits/chosen": -0.8329405188560486, "logits/rejected": -0.8425594568252563, "logps/chosen": -334.819580078125, "logps/rejected": -353.1991882324219, "loss": 0.6789, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9176943898200989, "rewards/margins": 0.08534470945596695, "rewards/rejected": -1.0030391216278076, "step": 140 }, { "epoch": 0.30140280561122246, "grad_norm": 8.349695032017713, "learning_rate": 8.814147859311332e-07, "logits/chosen": -0.7287541031837463, "logits/rejected": -0.747150182723999, "logps/chosen": -338.96990966796875, "logps/rejected": -393.1916809082031, "loss": 0.6085, "rewards/accuracies": 0.78125, "rewards/chosen": -0.868696928024292, "rewards/margins": 0.3027462959289551, "rewards/rejected": -1.1714433431625366, "step": 141 }, { "epoch": 0.30354041416165667, "grad_norm": 8.507916003943253, "learning_rate": 8.789858615727264e-07, "logits/chosen": -0.6775808930397034, "logits/rejected": -0.6213993430137634, "logps/chosen": -374.7777099609375, "logps/rejected": -441.28265380859375, "loss": 0.5921, "rewards/accuracies": 0.8125, "rewards/chosen": -1.076945424079895, "rewards/margins": 0.4260719418525696, "rewards/rejected": -1.5030174255371094, "step": 142 }, { "epoch": 0.3056780227120908, "grad_norm": 8.266582578388473, "learning_rate": 8.765357330018055e-07, "logits/chosen": -0.7523927092552185, "logits/rejected": -0.7748714685440063, "logps/chosen": -353.6466064453125, "logps/rejected": -402.60662841796875, "loss": 0.625, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0103652477264404, "rewards/margins": 0.3086761236190796, "rewards/rejected": -1.31904137134552, "step": 143 }, { "epoch": 0.30781563126252504, "grad_norm": 8.078736110217639, "learning_rate": 8.740645373027634e-07, "logits/chosen": -0.72418212890625, "logits/rejected": -0.7301138639450073, "logps/chosen": -414.23004150390625, "logps/rejected": -465.2354736328125, "loss": 0.6034, "rewards/accuracies": 0.625, "rewards/chosen": -1.1818435192108154, "rewards/margins": 0.26852208375930786, "rewards/rejected": -1.450365662574768, "step": 144 }, { "epoch": 0.30995323981295925, "grad_norm": 8.551096723015775, "learning_rate": 8.71572412738697e-07, "logits/chosen": -0.7613222599029541, "logits/rejected": -0.7519202828407288, "logps/chosen": -376.8845520019531, "logps/rejected": -391.2483215332031, "loss": 0.6422, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2536505460739136, "rewards/margins": 0.05302443727850914, "rewards/rejected": -1.3066749572753906, "step": 145 }, { "epoch": 0.31209084836339346, "grad_norm": 10.264598567431593, "learning_rate": 8.690594987436704e-07, "logits/chosen": -0.6667072772979736, "logits/rejected": -0.651785135269165, "logps/chosen": -407.5121765136719, "logps/rejected": -414.15325927734375, "loss": 0.7022, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3455419540405273, "rewards/margins": 0.1597069501876831, "rewards/rejected": -1.5052489042282104, "step": 146 }, { "epoch": 0.31422845691382767, "grad_norm": 8.003275854261016, "learning_rate": 8.66525935914913e-07, "logits/chosen": -0.70644611120224, "logits/rejected": -0.7072776556015015, "logps/chosen": -298.8578186035156, "logps/rejected": -352.6321105957031, "loss": 0.6026, "rewards/accuracies": 0.75, "rewards/chosen": -0.7530163526535034, "rewards/margins": 0.3677568733692169, "rewards/rejected": -1.1207730770111084, "step": 147 }, { "epoch": 0.3163660654642619, "grad_norm": 9.622881147148561, "learning_rate": 8.639718660049554e-07, "logits/chosen": -0.7758994102478027, "logits/rejected": -0.7696230411529541, "logps/chosen": -305.4625549316406, "logps/rejected": -307.0013732910156, "loss": 0.6654, "rewards/accuracies": 0.53125, "rewards/chosen": -0.9157548546791077, "rewards/margins": 0.10377232730388641, "rewards/rejected": -1.0195271968841553, "step": 148 }, { "epoch": 0.31850367401469604, "grad_norm": 9.830442606055694, "learning_rate": 8.613974319136957e-07, "logits/chosen": -0.6808797121047974, "logits/rejected": -0.6591075658798218, "logps/chosen": -328.95526123046875, "logps/rejected": -344.4721374511719, "loss": 0.653, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1110159158706665, "rewards/margins": 0.16696244478225708, "rewards/rejected": -1.2779783010482788, "step": 149 }, { "epoch": 0.32064128256513025, "grad_norm": 8.745762248320213, "learning_rate": 8.588027776804058e-07, "logits/chosen": -0.7875892519950867, "logits/rejected": -0.7677904963493347, "logps/chosen": -357.419677734375, "logps/rejected": -373.40460205078125, "loss": 0.6461, "rewards/accuracies": 0.625, "rewards/chosen": -1.0266187191009521, "rewards/margins": 0.21186724305152893, "rewards/rejected": -1.2384859323501587, "step": 150 }, { "epoch": 0.32277889111556446, "grad_norm": 8.312609091320738, "learning_rate": 8.561880484756724e-07, "logits/chosen": -0.7948569059371948, "logits/rejected": -0.7845500707626343, "logps/chosen": -341.4780578613281, "logps/rejected": -384.87615966796875, "loss": 0.622, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9681000113487244, "rewards/margins": 0.3343212306499481, "rewards/rejected": -1.30242121219635, "step": 151 }, { "epoch": 0.32491649966599867, "grad_norm": 9.180312248349901, "learning_rate": 8.535533905932737e-07, "logits/chosen": -0.7717313170433044, "logits/rejected": -0.7632758617401123, "logps/chosen": -349.5531921386719, "logps/rejected": -348.3159484863281, "loss": 0.6628, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1458628177642822, "rewards/margins": 0.07357059419155121, "rewards/rejected": -1.2194334268569946, "step": 152 }, { "epoch": 0.3270541082164329, "grad_norm": 8.40986643995496, "learning_rate": 8.508989514419958e-07, "logits/chosen": -0.6287474036216736, "logits/rejected": -0.5992534160614014, "logps/chosen": -327.4925842285156, "logps/rejected": -357.3055725097656, "loss": 0.6299, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0919466018676758, "rewards/margins": 0.2580554485321045, "rewards/rejected": -1.3500020503997803, "step": 153 }, { "epoch": 0.3291917167668671, "grad_norm": 9.217933303499299, "learning_rate": 8.482248795373835e-07, "logits/chosen": -0.7915253639221191, "logits/rejected": -0.7664984464645386, "logps/chosen": -368.6262512207031, "logps/rejected": -391.03564453125, "loss": 0.6426, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0133848190307617, "rewards/margins": 0.11703261733055115, "rewards/rejected": -1.1304173469543457, "step": 154 }, { "epoch": 0.33132932531730125, "grad_norm": 8.472153097719477, "learning_rate": 8.455313244934324e-07, "logits/chosen": -0.8312329649925232, "logits/rejected": -0.8426264524459839, "logps/chosen": -341.4083251953125, "logps/rejected": -377.6736145019531, "loss": 0.6149, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0413603782653809, "rewards/margins": 0.24394717812538147, "rewards/rejected": -1.2853076457977295, "step": 155 }, { "epoch": 0.33346693386773546, "grad_norm": 8.186342714745207, "learning_rate": 8.428184370142171e-07, "logits/chosen": -0.6921215653419495, "logits/rejected": -0.7096705436706543, "logps/chosen": -363.21539306640625, "logps/rejected": -384.2535400390625, "loss": 0.6144, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9978117346763611, "rewards/margins": 0.18856188654899597, "rewards/rejected": -1.1863737106323242, "step": 156 }, { "epoch": 0.33560454241816967, "grad_norm": 8.626047256669759, "learning_rate": 8.400863688854596e-07, "logits/chosen": -0.8120739459991455, "logits/rejected": -0.8196284770965576, "logps/chosen": -347.4595947265625, "logps/rejected": -357.397705078125, "loss": 0.6446, "rewards/accuracies": 0.625, "rewards/chosen": -1.0381972789764404, "rewards/margins": 0.1279696524143219, "rewards/rejected": -1.1661670207977295, "step": 157 }, { "epoch": 0.3377421509686039, "grad_norm": 11.860996272985476, "learning_rate": 8.373352729660372e-07, "logits/chosen": -0.7756985425949097, "logits/rejected": -0.7191120386123657, "logps/chosen": -395.55401611328125, "logps/rejected": -403.5904541015625, "loss": 0.6526, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2536171674728394, "rewards/margins": 0.026868807151913643, "rewards/rejected": -1.280485987663269, "step": 158 }, { "epoch": 0.3398797595190381, "grad_norm": 9.122376865267006, "learning_rate": 8.34565303179429e-07, "logits/chosen": -0.8109874725341797, "logits/rejected": -0.784782886505127, "logps/chosen": -349.2673645019531, "logps/rejected": -355.31427001953125, "loss": 0.6482, "rewards/accuracies": 0.5, "rewards/chosen": -1.1418871879577637, "rewards/margins": 0.0541604682803154, "rewards/rejected": -1.1960475444793701, "step": 159 }, { "epoch": 0.3420173680694723, "grad_norm": 10.446536418824028, "learning_rate": 8.317766145051057e-07, "logits/chosen": -0.8555909395217896, "logits/rejected": -0.8299651145935059, "logps/chosen": -393.7392272949219, "logps/rejected": -433.85565185546875, "loss": 0.68, "rewards/accuracies": 0.625, "rewards/chosen": -1.2185660600662231, "rewards/margins": 0.22462578117847443, "rewards/rejected": -1.4431917667388916, "step": 160 }, { "epoch": 0.34415497661990646, "grad_norm": 7.825411706225049, "learning_rate": 8.289693629698563e-07, "logits/chosen": -0.7958833575248718, "logits/rejected": -0.8027774095535278, "logps/chosen": -402.79913330078125, "logps/rejected": -437.49591064453125, "loss": 0.6203, "rewards/accuracies": 0.75, "rewards/chosen": -1.1367416381835938, "rewards/margins": 0.3376201391220093, "rewards/rejected": -1.4743616580963135, "step": 161 }, { "epoch": 0.34629258517034067, "grad_norm": 9.034553362218846, "learning_rate": 8.261437056390606e-07, "logits/chosen": -0.697302520275116, "logits/rejected": -0.6625763773918152, "logps/chosen": -349.05950927734375, "logps/rejected": -353.0817565917969, "loss": 0.6857, "rewards/accuracies": 0.5, "rewards/chosen": -0.9678500294685364, "rewards/margins": 0.10605783760547638, "rewards/rejected": -1.0739078521728516, "step": 162 }, { "epoch": 0.3484301937207749, "grad_norm": 8.737777630064887, "learning_rate": 8.232998006078997e-07, "logits/chosen": -0.674803614616394, "logits/rejected": -0.6823403835296631, "logps/chosen": -358.0148620605469, "logps/rejected": -384.6661071777344, "loss": 0.6235, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1485981941223145, "rewards/margins": 0.25151392817497253, "rewards/rejected": -1.4001121520996094, "step": 163 }, { "epoch": 0.3505678022712091, "grad_norm": 9.343701031382219, "learning_rate": 8.20437806992512e-07, "logits/chosen": -0.7436198592185974, "logits/rejected": -0.7431969046592712, "logps/chosen": -316.6277770996094, "logps/rejected": -367.4931335449219, "loss": 0.6664, "rewards/accuracies": 0.75, "rewards/chosen": -1.0274879932403564, "rewards/margins": 0.1997983604669571, "rewards/rejected": -1.2272862195968628, "step": 164 }, { "epoch": 0.3527054108216433, "grad_norm": 8.418205426089232, "learning_rate": 8.175578849210894e-07, "logits/chosen": -0.7993863224983215, "logits/rejected": -0.7827702164649963, "logps/chosen": -393.60791015625, "logps/rejected": -424.71124267578125, "loss": 0.6392, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1842068433761597, "rewards/margins": 0.3191227316856384, "rewards/rejected": -1.5033295154571533, "step": 165 }, { "epoch": 0.35484301937207746, "grad_norm": 8.984910438057955, "learning_rate": 8.146601955249187e-07, "logits/chosen": -0.7122502326965332, "logits/rejected": -0.7099603414535522, "logps/chosen": -365.7021179199219, "logps/rejected": -365.78912353515625, "loss": 0.6637, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2422032356262207, "rewards/margins": -0.04617507755756378, "rewards/rejected": -1.196028232574463, "step": 166 }, { "epoch": 0.3569806279225117, "grad_norm": 7.926871474687121, "learning_rate": 8.117449009293668e-07, "logits/chosen": -0.7609111666679382, "logits/rejected": -0.7424649000167847, "logps/chosen": -367.951416015625, "logps/rejected": -388.4793395996094, "loss": 0.6288, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1471714973449707, "rewards/margins": 0.1946582943201065, "rewards/rejected": -1.341829776763916, "step": 167 }, { "epoch": 0.3591182364729459, "grad_norm": 8.370915442021108, "learning_rate": 8.088121642448089e-07, "logits/chosen": -0.7230314016342163, "logits/rejected": -0.7338634729385376, "logps/chosen": -383.22216796875, "logps/rejected": -422.01068115234375, "loss": 0.6387, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0255863666534424, "rewards/margins": 0.5024391412734985, "rewards/rejected": -1.528025507926941, "step": 168 }, { "epoch": 0.3612558450233801, "grad_norm": 8.13714962488371, "learning_rate": 8.058621495575031e-07, "logits/chosen": -0.6844447255134583, "logits/rejected": -0.6487768888473511, "logps/chosen": -350.7132568359375, "logps/rejected": -367.8271484375, "loss": 0.6105, "rewards/accuracies": 0.65625, "rewards/chosen": -1.101859450340271, "rewards/margins": 0.21480971574783325, "rewards/rejected": -1.316669225692749, "step": 169 }, { "epoch": 0.3633934535738143, "grad_norm": 9.892000882662467, "learning_rate": 8.028950219204099e-07, "logits/chosen": -0.5773683190345764, "logits/rejected": -0.5765209794044495, "logps/chosen": -370.47796630859375, "logps/rejected": -415.53167724609375, "loss": 0.5997, "rewards/accuracies": 0.75, "rewards/chosen": -1.1410350799560547, "rewards/margins": 0.3918205499649048, "rewards/rejected": -1.532855749130249, "step": 170 }, { "epoch": 0.3655310621242485, "grad_norm": 8.106890489682133, "learning_rate": 7.999109473439569e-07, "logits/chosen": -0.6529942154884338, "logits/rejected": -0.6343085169792175, "logps/chosen": -358.777099609375, "logps/rejected": -388.01995849609375, "loss": 0.6249, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1139953136444092, "rewards/margins": 0.19699575006961823, "rewards/rejected": -1.3109909296035767, "step": 171 }, { "epoch": 0.3676686706746827, "grad_norm": 7.818768691894028, "learning_rate": 7.969100927867507e-07, "logits/chosen": -0.7647715210914612, "logits/rejected": -0.768187940120697, "logps/chosen": -315.3676452636719, "logps/rejected": -344.0660095214844, "loss": 0.6095, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9741207361221313, "rewards/margins": 0.2204282581806183, "rewards/rejected": -1.1945490837097168, "step": 172 }, { "epoch": 0.3698062792251169, "grad_norm": 8.22923262916057, "learning_rate": 7.938926261462365e-07, "logits/chosen": -0.7851884961128235, "logits/rejected": -0.8041540384292603, "logps/chosen": -318.61712646484375, "logps/rejected": -398.345458984375, "loss": 0.5961, "rewards/accuracies": 0.75, "rewards/chosen": -0.9148141741752625, "rewards/margins": 0.5934224128723145, "rewards/rejected": -1.5082364082336426, "step": 173 }, { "epoch": 0.3719438877755511, "grad_norm": 8.000153942367305, "learning_rate": 7.908587162493028e-07, "logits/chosen": -0.6852933168411255, "logits/rejected": -0.6849787831306458, "logps/chosen": -406.8155212402344, "logps/rejected": -447.3205261230469, "loss": 0.6301, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2369593381881714, "rewards/margins": 0.23439320921897888, "rewards/rejected": -1.4713525772094727, "step": 174 }, { "epoch": 0.3740814963259853, "grad_norm": 9.316195243095242, "learning_rate": 7.878085328428368e-07, "logits/chosen": -0.7409089803695679, "logits/rejected": -0.7157390713691711, "logps/chosen": -338.6581115722656, "logps/rejected": -357.2071838378906, "loss": 0.646, "rewards/accuracies": 0.625, "rewards/chosen": -1.051304817199707, "rewards/margins": 0.1540244072675705, "rewards/rejected": -1.2053292989730835, "step": 175 }, { "epoch": 0.3762191048764195, "grad_norm": 9.548217777892644, "learning_rate": 7.84742246584226e-07, "logits/chosen": -0.644868016242981, "logits/rejected": -0.6358535885810852, "logps/chosen": -280.54644775390625, "logps/rejected": -320.94866943359375, "loss": 0.6304, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9811806082725525, "rewards/margins": 0.36237311363220215, "rewards/rejected": -1.3435536623001099, "step": 176 }, { "epoch": 0.37835671342685373, "grad_norm": 8.213256746442323, "learning_rate": 7.81660029031811e-07, "logits/chosen": -0.7351135015487671, "logits/rejected": -0.7099937796592712, "logps/chosen": -403.18609619140625, "logps/rejected": -427.598876953125, "loss": 0.6286, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3992477655410767, "rewards/margins": 0.21843519806861877, "rewards/rejected": -1.6176831722259521, "step": 177 }, { "epoch": 0.3804943219772879, "grad_norm": 9.877087816575154, "learning_rate": 7.785620526352861e-07, "logits/chosen": -0.6065413355827332, "logits/rejected": -0.6187620759010315, "logps/chosen": -417.3489074707031, "logps/rejected": -418.964599609375, "loss": 0.6396, "rewards/accuracies": 0.625, "rewards/chosen": -1.4968540668487549, "rewards/margins": 0.15608513355255127, "rewards/rejected": -1.6529392004013062, "step": 178 }, { "epoch": 0.3826319305277221, "grad_norm": 9.518985768520599, "learning_rate": 7.754484907260512e-07, "logits/chosen": -0.6335625648498535, "logits/rejected": -0.6501979231834412, "logps/chosen": -320.66973876953125, "logps/rejected": -377.0364990234375, "loss": 0.6203, "rewards/accuracies": 0.59375, "rewards/chosen": -0.9643224477767944, "rewards/margins": 0.4927278459072113, "rewards/rejected": -1.4570502042770386, "step": 179 }, { "epoch": 0.3847695390781563, "grad_norm": 8.327661579909856, "learning_rate": 7.723195175075135e-07, "logits/chosen": -0.8008890748023987, "logits/rejected": -0.8070433735847473, "logps/chosen": -385.35711669921875, "logps/rejected": -417.4620056152344, "loss": 0.63, "rewards/accuracies": 0.75, "rewards/chosen": -1.1224894523620605, "rewards/margins": 0.2396533489227295, "rewards/rejected": -1.36214280128479, "step": 180 }, { "epoch": 0.3869071476285905, "grad_norm": 8.177222796415196, "learning_rate": 7.691753080453411e-07, "logits/chosen": -0.7654060125350952, "logits/rejected": -0.7563324570655823, "logps/chosen": -372.6927185058594, "logps/rejected": -392.412841796875, "loss": 0.6178, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2625975608825684, "rewards/margins": 0.1758994460105896, "rewards/rejected": -1.4384969472885132, "step": 181 }, { "epoch": 0.38904475617902473, "grad_norm": 8.268029500490163, "learning_rate": 7.660160382576683e-07, "logits/chosen": -0.8044633865356445, "logits/rejected": -0.8295111060142517, "logps/chosen": -387.167724609375, "logps/rejected": -421.73223876953125, "loss": 0.6057, "rewards/accuracies": 0.75, "rewards/chosen": -1.2140891551971436, "rewards/margins": 0.26494261622428894, "rewards/rejected": -1.4790318012237549, "step": 182 }, { "epoch": 0.39118236472945894, "grad_norm": 8.205646872076233, "learning_rate": 7.628418849052523e-07, "logits/chosen": -0.7259032726287842, "logits/rejected": -0.7147877812385559, "logps/chosen": -332.19952392578125, "logps/rejected": -358.916748046875, "loss": 0.6335, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2255278825759888, "rewards/margins": 0.18311724066734314, "rewards/rejected": -1.4086451530456543, "step": 183 }, { "epoch": 0.3933199732798931, "grad_norm": 9.273642724003066, "learning_rate": 7.596530255815845e-07, "logits/chosen": -0.6111272573471069, "logits/rejected": -0.6174825429916382, "logps/chosen": -431.119140625, "logps/rejected": -474.2237854003906, "loss": 0.5914, "rewards/accuracies": 0.6875, "rewards/chosen": -1.340221643447876, "rewards/margins": 0.47385597229003906, "rewards/rejected": -1.814077615737915, "step": 184 }, { "epoch": 0.3954575818303273, "grad_norm": 9.03001535554559, "learning_rate": 7.564496387029531e-07, "logits/chosen": -0.5710060000419617, "logits/rejected": -0.6027272343635559, "logps/chosen": -402.5880432128906, "logps/rejected": -461.81390380859375, "loss": 0.5815, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2832955121994019, "rewards/margins": 0.44037097692489624, "rewards/rejected": -1.7236665487289429, "step": 185 }, { "epoch": 0.3975951903807615, "grad_norm": 10.65885585737553, "learning_rate": 7.532319034984614e-07, "logits/chosen": -0.6792325377464294, "logits/rejected": -0.7070844769477844, "logps/chosen": -345.3462219238281, "logps/rejected": -380.2834167480469, "loss": 0.606, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0898866653442383, "rewards/margins": 0.2850308120250702, "rewards/rejected": -1.3749175071716309, "step": 186 }, { "epoch": 0.39973279893119573, "grad_norm": 10.832365903487103, "learning_rate": 7.5e-07, "logits/chosen": -0.6209002733230591, "logits/rejected": -0.5837200880050659, "logps/chosen": -448.4324035644531, "logps/rejected": -443.7789306640625, "loss": 0.6723, "rewards/accuracies": 0.46875, "rewards/chosen": -1.5713595151901245, "rewards/margins": 0.1559199094772339, "rewards/rejected": -1.7272793054580688, "step": 187 }, { "epoch": 0.40187040748162994, "grad_norm": 8.78650176694853, "learning_rate": 7.467541090321733e-07, "logits/chosen": -0.6626260876655579, "logits/rejected": -0.6681480407714844, "logps/chosen": -357.17535400390625, "logps/rejected": -392.3631591796875, "loss": 0.6327, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0751440525054932, "rewards/margins": 0.28642958402633667, "rewards/rejected": -1.361573576927185, "step": 188 }, { "epoch": 0.40400801603206415, "grad_norm": 9.268270452493919, "learning_rate": 7.434944122021836e-07, "logits/chosen": -0.7080458402633667, "logits/rejected": -0.6918138861656189, "logps/chosen": -428.0231628417969, "logps/rejected": -447.84588623046875, "loss": 0.5866, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2396382093429565, "rewards/margins": 0.2672892212867737, "rewards/rejected": -1.506927490234375, "step": 189 }, { "epoch": 0.4061456245824983, "grad_norm": 9.083796097067385, "learning_rate": 7.402210918896689e-07, "logits/chosen": -0.6990772485733032, "logits/rejected": -0.6821334362030029, "logps/chosen": -330.4400329589844, "logps/rejected": -351.8786926269531, "loss": 0.5944, "rewards/accuracies": 0.59375, "rewards/chosen": -1.078837275505066, "rewards/margins": 0.31049594283103943, "rewards/rejected": -1.3893331289291382, "step": 190 }, { "epoch": 0.4082832331329325, "grad_norm": 8.342069569715447, "learning_rate": 7.369343312364993e-07, "logits/chosen": -0.6898236870765686, "logits/rejected": -0.7303708791732788, "logps/chosen": -365.74688720703125, "logps/rejected": -406.60125732421875, "loss": 0.5822, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3478041887283325, "rewards/margins": 0.3217250108718872, "rewards/rejected": -1.6695290803909302, "step": 191 }, { "epoch": 0.41042084168336673, "grad_norm": 11.11112446563951, "learning_rate": 7.33634314136531e-07, "logits/chosen": -0.567010223865509, "logits/rejected": -0.5823702812194824, "logps/chosen": -351.3297119140625, "logps/rejected": -352.91400146484375, "loss": 0.6731, "rewards/accuracies": 0.65625, "rewards/chosen": -1.202454924583435, "rewards/margins": 0.1386478990316391, "rewards/rejected": -1.3411028385162354, "step": 192 }, { "epoch": 0.41255845023380094, "grad_norm": 10.094586281846308, "learning_rate": 7.303212252253161e-07, "logits/chosen": -0.6867839694023132, "logits/rejected": -0.631986677646637, "logps/chosen": -446.3096008300781, "logps/rejected": -481.1722717285156, "loss": 0.5804, "rewards/accuracies": 0.75, "rewards/chosen": -1.420650601387024, "rewards/margins": 0.3154638409614563, "rewards/rejected": -1.7361143827438354, "step": 193 }, { "epoch": 0.41469605878423516, "grad_norm": 9.22745603420781, "learning_rate": 7.269952498697734e-07, "logits/chosen": -0.6122913360595703, "logits/rejected": -0.5846338868141174, "logps/chosen": -404.2279052734375, "logps/rejected": -479.53546142578125, "loss": 0.5926, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4300614595413208, "rewards/margins": 0.682074785232544, "rewards/rejected": -2.112136125564575, "step": 194 }, { "epoch": 0.4168336673346693, "grad_norm": 8.922810963128454, "learning_rate": 7.236565741578162e-07, "logits/chosen": -0.7965989112854004, "logits/rejected": -0.8105958104133606, "logps/chosen": -412.70068359375, "logps/rejected": -459.9786376953125, "loss": 0.6098, "rewards/accuracies": 0.75, "rewards/chosen": -1.1933541297912598, "rewards/margins": 0.4532015025615692, "rewards/rejected": -1.6465556621551514, "step": 195 }, { "epoch": 0.4189712758851035, "grad_norm": 9.70951721797377, "learning_rate": 7.203053848879418e-07, "logits/chosen": -0.66545569896698, "logits/rejected": -0.6426224708557129, "logps/chosen": -417.62750244140625, "logps/rejected": -446.0390625, "loss": 0.6345, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4516615867614746, "rewards/margins": 0.234949991106987, "rewards/rejected": -1.6866116523742676, "step": 196 }, { "epoch": 0.42110888443553773, "grad_norm": 10.834268818449964, "learning_rate": 7.16941869558779e-07, "logits/chosen": -0.6952583193778992, "logits/rejected": -0.6965677738189697, "logps/chosen": -447.6587829589844, "logps/rejected": -491.00872802734375, "loss": 0.6368, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6263762712478638, "rewards/margins": 0.3232609033584595, "rewards/rejected": -1.9496371746063232, "step": 197 }, { "epoch": 0.42324649298597194, "grad_norm": 10.734655546374897, "learning_rate": 7.135662163585984e-07, "logits/chosen": -0.7219685316085815, "logits/rejected": -0.7239058613777161, "logps/chosen": -379.8273620605469, "logps/rejected": -416.1621398925781, "loss": 0.6473, "rewards/accuracies": 0.75, "rewards/chosen": -1.4015758037567139, "rewards/margins": 0.279184490442276, "rewards/rejected": -1.6807602643966675, "step": 198 }, { "epoch": 0.42538410153640616, "grad_norm": 10.226763136881486, "learning_rate": 7.101786141547828e-07, "logits/chosen": -0.6653244495391846, "logits/rejected": -0.6480982303619385, "logps/chosen": -388.545166015625, "logps/rejected": -400.65447998046875, "loss": 0.6346, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3625783920288086, "rewards/margins": 0.17150306701660156, "rewards/rejected": -1.5340813398361206, "step": 199 }, { "epoch": 0.42752171008684037, "grad_norm": 11.800159452188982, "learning_rate": 7.067792524832603e-07, "logits/chosen": -0.802920401096344, "logits/rejected": -0.7953581213951111, "logps/chosen": -469.48583984375, "logps/rejected": -514.489501953125, "loss": 0.6935, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7296805381774902, "rewards/margins": 0.4207393229007721, "rewards/rejected": -2.1504197120666504, "step": 200 }, { "epoch": 0.42752171008684037, "eval_logits/chosen": -0.6628897190093994, "eval_logits/rejected": -0.6649256348609924, "eval_logps/chosen": -392.1436767578125, "eval_logps/rejected": -424.3627624511719, "eval_loss": 0.635185182094574, "eval_rewards/accuracies": 0.6544715166091919, "eval_rewards/chosen": -1.3659569025039673, "eval_rewards/margins": 0.2651316225528717, "eval_rewards/rejected": -1.6310884952545166, "eval_runtime": 376.3857, "eval_samples_per_second": 5.21, "eval_steps_per_second": 0.327, "step": 200 }, { "epoch": 0.4296593186372745, "grad_norm": 10.509050348979823, "learning_rate": 7.033683215379002e-07, "logits/chosen": -0.7490158081054688, "logits/rejected": -0.7795702219009399, "logps/chosen": -444.27264404296875, "logps/rejected": -450.5096435546875, "loss": 0.6259, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6222877502441406, "rewards/margins": 0.0773380845785141, "rewards/rejected": -1.699625849723816, "step": 201 }, { "epoch": 0.43179692718770873, "grad_norm": 9.361779380994284, "learning_rate": 6.999460121598704e-07, "logits/chosen": -0.8867595195770264, "logits/rejected": -0.8778724074363708, "logps/chosen": -395.88262939453125, "logps/rejected": -424.9254455566406, "loss": 0.6199, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3475773334503174, "rewards/margins": 0.2797107696533203, "rewards/rejected": -1.6272879838943481, "step": 202 }, { "epoch": 0.43393453573814295, "grad_norm": 10.465713404951545, "learning_rate": 6.965125158269618e-07, "logits/chosen": -0.7478022575378418, "logits/rejected": -0.7213735580444336, "logps/chosen": -375.4535217285156, "logps/rejected": -400.4565734863281, "loss": 0.6452, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3242757320404053, "rewards/margins": 0.18743321299552917, "rewards/rejected": -1.5117088556289673, "step": 203 }, { "epoch": 0.43607214428857716, "grad_norm": 9.542403717436502, "learning_rate": 6.93068024642873e-07, "logits/chosen": -0.7434294819831848, "logits/rejected": -0.7202074527740479, "logps/chosen": -367.4134216308594, "logps/rejected": -395.12396240234375, "loss": 0.6408, "rewards/accuracies": 0.65625, "rewards/chosen": -1.321439266204834, "rewards/margins": 0.3816969692707062, "rewards/rejected": -1.7031362056732178, "step": 204 }, { "epoch": 0.43820975283901137, "grad_norm": 11.318421364351005, "learning_rate": 6.896127313264642e-07, "logits/chosen": -0.6576538681983948, "logits/rejected": -0.6434054374694824, "logps/chosen": -381.1850280761719, "logps/rejected": -385.73736572265625, "loss": 0.6267, "rewards/accuracies": 0.5625, "rewards/chosen": -1.294406771659851, "rewards/margins": 0.12336639314889908, "rewards/rejected": -1.4177730083465576, "step": 205 }, { "epoch": 0.4403473613894456, "grad_norm": 8.514452329680676, "learning_rate": 6.861468292009726e-07, "logits/chosen": -0.652076780796051, "logits/rejected": -0.6382969617843628, "logps/chosen": -392.5809326171875, "logps/rejected": -430.596923828125, "loss": 0.6304, "rewards/accuracies": 0.6875, "rewards/chosen": -1.44416344165802, "rewards/margins": 0.37998878955841064, "rewards/rejected": -1.8241522312164307, "step": 206 }, { "epoch": 0.44248496993987974, "grad_norm": 9.679027742003948, "learning_rate": 6.826705121831976e-07, "logits/chosen": -0.7171617746353149, "logits/rejected": -0.7156708240509033, "logps/chosen": -378.35528564453125, "logps/rejected": -411.4539489746094, "loss": 0.6376, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3946869373321533, "rewards/margins": 0.37037399411201477, "rewards/rejected": -1.7650609016418457, "step": 207 }, { "epoch": 0.44462257849031395, "grad_norm": 10.610288706227443, "learning_rate": 6.7918397477265e-07, "logits/chosen": -0.6665509939193726, "logits/rejected": -0.6577183604240417, "logps/chosen": -365.9376525878906, "logps/rejected": -362.8846740722656, "loss": 0.6653, "rewards/accuracies": 0.5, "rewards/chosen": -1.1479219198226929, "rewards/margins": 0.01849663257598877, "rewards/rejected": -1.1664186716079712, "step": 208 }, { "epoch": 0.44676018704074816, "grad_norm": 9.112597710939323, "learning_rate": 6.756874120406714e-07, "logits/chosen": -0.6265541315078735, "logits/rejected": -0.61783766746521, "logps/chosen": -381.3807067871094, "logps/rejected": -425.4930725097656, "loss": 0.6119, "rewards/accuracies": 0.75, "rewards/chosen": -1.280937671661377, "rewards/margins": 0.3031711280345917, "rewards/rejected": -1.584108829498291, "step": 209 }, { "epoch": 0.44889779559118237, "grad_norm": 9.390736537982283, "learning_rate": 6.721810196195174e-07, "logits/chosen": -0.7654869556427002, "logits/rejected": -0.7667275071144104, "logps/chosen": -417.27069091796875, "logps/rejected": -449.1034851074219, "loss": 0.6217, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3596787452697754, "rewards/margins": 0.3042774796485901, "rewards/rejected": -1.6639561653137207, "step": 210 }, { "epoch": 0.4510354041416166, "grad_norm": 9.394078254466548, "learning_rate": 6.68664993691415e-07, "logits/chosen": -0.6547084450721741, "logits/rejected": -0.647241473197937, "logps/chosen": -336.94915771484375, "logps/rejected": -371.03515625, "loss": 0.6312, "rewards/accuracies": 0.75, "rewards/chosen": -0.9777745604515076, "rewards/margins": 0.2829311490058899, "rewards/rejected": -1.2607057094573975, "step": 211 }, { "epoch": 0.4531730126920508, "grad_norm": 8.760414806290829, "learning_rate": 6.651395309775836e-07, "logits/chosen": -0.6064110398292542, "logits/rejected": -0.5819242000579834, "logps/chosen": -353.7124938964844, "logps/rejected": -384.0793151855469, "loss": 0.5966, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1342616081237793, "rewards/margins": 0.3782859742641449, "rewards/rejected": -1.512547492980957, "step": 212 }, { "epoch": 0.45531062124248495, "grad_norm": 9.331952485323354, "learning_rate": 6.6160482872723e-07, "logits/chosen": -0.6409544944763184, "logits/rejected": -0.6478085517883301, "logps/chosen": -374.2773742675781, "logps/rejected": -397.2945861816406, "loss": 0.6342, "rewards/accuracies": 0.625, "rewards/chosen": -1.2687841653823853, "rewards/margins": 0.11245452612638474, "rewards/rejected": -1.3812386989593506, "step": 213 }, { "epoch": 0.45744822979291916, "grad_norm": 9.844190008748196, "learning_rate": 6.580610847065123e-07, "logits/chosen": -0.6078667640686035, "logits/rejected": -0.60109543800354, "logps/chosen": -357.74810791015625, "logps/rejected": -391.06268310546875, "loss": 0.614, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1224141120910645, "rewards/margins": 0.23654705286026, "rewards/rejected": -1.3589611053466797, "step": 214 }, { "epoch": 0.45958583834335337, "grad_norm": 9.317047438854233, "learning_rate": 6.545084971874736e-07, "logits/chosen": -0.608707845211029, "logits/rejected": -0.6254767775535583, "logps/chosen": -340.4634094238281, "logps/rejected": -377.37152099609375, "loss": 0.655, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0063505172729492, "rewards/margins": 0.29992133378982544, "rewards/rejected": -1.3062719106674194, "step": 215 }, { "epoch": 0.4617234468937876, "grad_norm": 9.52121536372048, "learning_rate": 6.509472649369509e-07, "logits/chosen": -0.642886221408844, "logits/rejected": -0.6272940039634705, "logps/chosen": -324.8238525390625, "logps/rejected": -367.4193115234375, "loss": 0.5939, "rewards/accuracies": 0.625, "rewards/chosen": -0.9792121052742004, "rewards/margins": 0.3688339293003082, "rewards/rejected": -1.3480459451675415, "step": 216 }, { "epoch": 0.4638610554442218, "grad_norm": 10.890742309360663, "learning_rate": 6.473775872054521e-07, "logits/chosen": -0.6968441009521484, "logits/rejected": -0.6998182535171509, "logps/chosen": -425.0888977050781, "logps/rejected": -457.17889404296875, "loss": 0.6358, "rewards/accuracies": 0.59375, "rewards/chosen": -1.394580364227295, "rewards/margins": 0.2801092267036438, "rewards/rejected": -1.6746896505355835, "step": 217 }, { "epoch": 0.465998663994656, "grad_norm": 8.973438938365845, "learning_rate": 6.437996637160086e-07, "logits/chosen": -0.6339977979660034, "logits/rejected": -0.605747401714325, "logps/chosen": -359.0996398925781, "logps/rejected": -398.10284423828125, "loss": 0.6284, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1331509351730347, "rewards/margins": 0.29103800654411316, "rewards/rejected": -1.4241892099380493, "step": 218 }, { "epoch": 0.46813627254509016, "grad_norm": 9.024594454350343, "learning_rate": 6.402136946530014e-07, "logits/chosen": -0.6726840734481812, "logits/rejected": -0.6727656722068787, "logps/chosen": -411.4464111328125, "logps/rejected": -438.1039733886719, "loss": 0.6074, "rewards/accuracies": 0.71875, "rewards/chosen": -1.137448787689209, "rewards/margins": 0.2949088513851166, "rewards/rejected": -1.4323575496673584, "step": 219 }, { "epoch": 0.47027388109552437, "grad_norm": 9.318954884923675, "learning_rate": 6.3661988065096e-07, "logits/chosen": -0.5828653573989868, "logits/rejected": -0.5879778861999512, "logps/chosen": -416.6065673828125, "logps/rejected": -447.0008544921875, "loss": 0.6169, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4899258613586426, "rewards/margins": 0.2851335406303406, "rewards/rejected": -1.7750593423843384, "step": 220 }, { "epoch": 0.4724114896459586, "grad_norm": 10.91663306855251, "learning_rate": 6.330184227833375e-07, "logits/chosen": -0.6656166315078735, "logits/rejected": -0.654589056968689, "logps/chosen": -380.12811279296875, "logps/rejected": -417.3984069824219, "loss": 0.5782, "rewards/accuracies": 0.875, "rewards/chosen": -1.2443169355392456, "rewards/margins": 0.4823899269104004, "rewards/rejected": -1.726706862449646, "step": 221 }, { "epoch": 0.4745490981963928, "grad_norm": 9.66304655835996, "learning_rate": 6.294095225512604e-07, "logits/chosen": -0.6804403066635132, "logits/rejected": -0.6730751395225525, "logps/chosen": -391.51995849609375, "logps/rejected": -434.95928955078125, "loss": 0.6007, "rewards/accuracies": 0.6875, "rewards/chosen": -1.164958119392395, "rewards/margins": 0.4750928580760956, "rewards/rejected": -1.640051007270813, "step": 222 }, { "epoch": 0.476686706746827, "grad_norm": 11.307540321918372, "learning_rate": 6.257933818722542e-07, "logits/chosen": -0.6279383301734924, "logits/rejected": -0.6163449883460999, "logps/chosen": -376.4117736816406, "logps/rejected": -397.33917236328125, "loss": 0.6871, "rewards/accuracies": 0.4375, "rewards/chosen": -1.320064663887024, "rewards/margins": 0.12475023418664932, "rewards/rejected": -1.444814920425415, "step": 223 }, { "epoch": 0.4788243152972612, "grad_norm": 9.673767465041793, "learning_rate": 6.22170203068947e-07, "logits/chosen": -0.711574912071228, "logits/rejected": -0.6971991062164307, "logps/chosen": -370.3948059082031, "logps/rejected": -394.70379638671875, "loss": 0.594, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2323672771453857, "rewards/margins": 0.2632126808166504, "rewards/rejected": -1.4955798387527466, "step": 224 }, { "epoch": 0.48096192384769537, "grad_norm": 14.301565196390225, "learning_rate": 6.185401888577487e-07, "logits/chosen": -0.7201038599014282, "logits/rejected": -0.713502049446106, "logps/chosen": -411.15997314453125, "logps/rejected": -440.4834289550781, "loss": 0.6286, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4549378156661987, "rewards/margins": 0.26348626613616943, "rewards/rejected": -1.7184242010116577, "step": 225 }, { "epoch": 0.4830995323981296, "grad_norm": 10.44824838559519, "learning_rate": 6.149035423375098e-07, "logits/chosen": -0.7044095993041992, "logits/rejected": -0.7011440992355347, "logps/chosen": -394.7225341796875, "logps/rejected": -418.9303283691406, "loss": 0.6385, "rewards/accuracies": 0.5, "rewards/chosen": -1.2195916175842285, "rewards/margins": 0.20536328852176666, "rewards/rejected": -1.424954891204834, "step": 226 }, { "epoch": 0.4852371409485638, "grad_norm": 11.00631388790137, "learning_rate": 6.112604669781572e-07, "logits/chosen": -0.735901951789856, "logits/rejected": -0.6977694034576416, "logps/chosen": -438.9553527832031, "logps/rejected": -447.6878662109375, "loss": 0.6141, "rewards/accuracies": 0.53125, "rewards/chosen": -1.478360891342163, "rewards/margins": 0.119898721575737, "rewards/rejected": -1.5982595682144165, "step": 227 }, { "epoch": 0.487374749498998, "grad_norm": 10.507160088155747, "learning_rate": 6.07611166609311e-07, "logits/chosen": -0.7429340481758118, "logits/rejected": -0.7295467257499695, "logps/chosen": -430.9995422363281, "logps/rejected": -448.747314453125, "loss": 0.6533, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4200453758239746, "rewards/margins": 0.17433959245681763, "rewards/rejected": -1.594385027885437, "step": 228 }, { "epoch": 0.4895123580494322, "grad_norm": 8.307584591306474, "learning_rate": 6.039558454088795e-07, "logits/chosen": -0.6406713128089905, "logits/rejected": -0.6399562358856201, "logps/chosen": -332.7983703613281, "logps/rejected": -353.19384765625, "loss": 0.5913, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1042307615280151, "rewards/margins": 0.19572903215885162, "rewards/rejected": -1.2999597787857056, "step": 229 }, { "epoch": 0.4916499665998664, "grad_norm": 11.994876856372567, "learning_rate": 6.002947078916364e-07, "logits/chosen": -0.6426191926002502, "logits/rejected": -0.6602756977081299, "logps/chosen": -344.1719665527344, "logps/rejected": -356.783935546875, "loss": 0.69, "rewards/accuracies": 0.5, "rewards/chosen": -1.2262554168701172, "rewards/margins": 0.10306321084499359, "rewards/rejected": -1.329318642616272, "step": 230 }, { "epoch": 0.4937875751503006, "grad_norm": 10.289938408873015, "learning_rate": 5.966279588977766e-07, "logits/chosen": -0.7598620653152466, "logits/rejected": -0.7735162377357483, "logps/chosen": -382.27630615234375, "logps/rejected": -393.7611083984375, "loss": 0.6243, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1877317428588867, "rewards/margins": 0.17938965559005737, "rewards/rejected": -1.3671213388442993, "step": 231 }, { "epoch": 0.4959251837007348, "grad_norm": 10.32285872025184, "learning_rate": 5.929558035814574e-07, "logits/chosen": -0.5800771713256836, "logits/rejected": -0.5892568826675415, "logps/chosen": -364.911376953125, "logps/rejected": -363.5468444824219, "loss": 0.6196, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3333511352539062, "rewards/margins": 0.1056426540017128, "rewards/rejected": -1.4389936923980713, "step": 232 }, { "epoch": 0.498062792251169, "grad_norm": 10.3934909690253, "learning_rate": 5.892784473993183e-07, "logits/chosen": -0.6197159290313721, "logits/rejected": -0.6411285400390625, "logps/chosen": -372.03424072265625, "logps/rejected": -401.731201171875, "loss": 0.5626, "rewards/accuracies": 0.71875, "rewards/chosen": -1.235286831855774, "rewards/margins": 0.3155067563056946, "rewards/rejected": -1.5507938861846924, "step": 233 }, { "epoch": 0.5002004008016032, "grad_norm": 10.257979100996899, "learning_rate": 5.855960960989876e-07, "logits/chosen": -0.7090120911598206, "logits/rejected": -0.6980421543121338, "logps/chosen": -328.9789123535156, "logps/rejected": -349.19036865234375, "loss": 0.6148, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2482969760894775, "rewards/margins": 0.14759615063667297, "rewards/rejected": -1.3958930969238281, "step": 234 }, { "epoch": 0.5023380093520374, "grad_norm": 8.317098893301642, "learning_rate": 5.819089557075688e-07, "logits/chosen": -0.7996770739555359, "logits/rejected": -0.7929503917694092, "logps/chosen": -331.7429504394531, "logps/rejected": -369.83746337890625, "loss": 0.6066, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9831193089485168, "rewards/margins": 0.4169498383998871, "rewards/rejected": -1.400068998336792, "step": 235 }, { "epoch": 0.5044756179024716, "grad_norm": 10.217065084123991, "learning_rate": 5.782172325201155e-07, "logits/chosen": -0.6208564043045044, "logits/rejected": -0.635725200176239, "logps/chosen": -344.1796875, "logps/rejected": -389.52923583984375, "loss": 0.6368, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1277637481689453, "rewards/margins": 0.37036919593811035, "rewards/rejected": -1.4981330633163452, "step": 236 }, { "epoch": 0.5066132264529059, "grad_norm": 10.269160973282458, "learning_rate": 5.745211330880872e-07, "logits/chosen": -0.7708931565284729, "logits/rejected": -0.76704341173172, "logps/chosen": -433.10064697265625, "logps/rejected": -450.0901184082031, "loss": 0.6314, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3207013607025146, "rewards/margins": 0.2793017327785492, "rewards/rejected": -1.6000031232833862, "step": 237 }, { "epoch": 0.5087508350033401, "grad_norm": 10.399405697134318, "learning_rate": 5.708208642077945e-07, "logits/chosen": -0.6624871492385864, "logits/rejected": -0.6546816825866699, "logps/chosen": -333.7100524902344, "logps/rejected": -369.9173278808594, "loss": 0.6294, "rewards/accuracies": 0.75, "rewards/chosen": -1.1885484457015991, "rewards/margins": 0.3089646100997925, "rewards/rejected": -1.4975128173828125, "step": 238 }, { "epoch": 0.5108884435537742, "grad_norm": 9.80964346078248, "learning_rate": 5.671166329088277e-07, "logits/chosen": -0.7182386517524719, "logits/rejected": -0.7258840203285217, "logps/chosen": -356.9850769042969, "logps/rejected": -380.86285400390625, "loss": 0.6235, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3426018953323364, "rewards/margins": 0.24537137150764465, "rewards/rejected": -1.5879731178283691, "step": 239 }, { "epoch": 0.5130260521042084, "grad_norm": 10.018454517912645, "learning_rate": 5.634086464424742e-07, "logits/chosen": -0.6738543510437012, "logits/rejected": -0.6593906283378601, "logps/chosen": -359.072021484375, "logps/rejected": -400.2255554199219, "loss": 0.6674, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3493218421936035, "rewards/margins": 0.24295195937156677, "rewards/rejected": -1.5922737121582031, "step": 240 }, { "epoch": 0.5151636606546426, "grad_norm": 9.883809912496002, "learning_rate": 5.596971122701221e-07, "logits/chosen": -0.8064689636230469, "logits/rejected": -0.777323305606842, "logps/chosen": -383.2707214355469, "logps/rejected": -394.1632995605469, "loss": 0.6133, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2935426235198975, "rewards/margins": 0.17417016625404358, "rewards/rejected": -1.4677127599716187, "step": 241 }, { "epoch": 0.5173012692050768, "grad_norm": 9.835505308989157, "learning_rate": 5.559822380516539e-07, "logits/chosen": -0.74181067943573, "logits/rejected": -0.76103276014328, "logps/chosen": -413.28607177734375, "logps/rejected": -432.421142578125, "loss": 0.6135, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6156772375106812, "rewards/margins": 0.052678730338811874, "rewards/rejected": -1.668355941772461, "step": 242 }, { "epoch": 0.519438877755511, "grad_norm": 10.044057697511136, "learning_rate": 5.522642316338268e-07, "logits/chosen": -0.7109071016311646, "logits/rejected": -0.738073468208313, "logps/chosen": -371.0440673828125, "logps/rejected": -417.84588623046875, "loss": 0.6108, "rewards/accuracies": 0.75, "rewards/chosen": -1.2593053579330444, "rewards/margins": 0.29793858528137207, "rewards/rejected": -1.557244062423706, "step": 243 }, { "epoch": 0.5215764863059452, "grad_norm": 9.78032272573038, "learning_rate": 5.48543301038644e-07, "logits/chosen": -0.8035364747047424, "logits/rejected": -0.817245364189148, "logps/chosen": -408.1662292480469, "logps/rejected": -431.6890869140625, "loss": 0.657, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3148820400238037, "rewards/margins": 0.27137643098831177, "rewards/rejected": -1.5862585306167603, "step": 244 }, { "epoch": 0.5237140948563794, "grad_norm": 11.286066879084709, "learning_rate": 5.448196544517167e-07, "logits/chosen": -0.8000929355621338, "logits/rejected": -0.7960721254348755, "logps/chosen": -348.828369140625, "logps/rejected": -370.3882751464844, "loss": 0.6377, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3299049139022827, "rewards/margins": 0.21843267977237701, "rewards/rejected": -1.5483375787734985, "step": 245 }, { "epoch": 0.5258517034068136, "grad_norm": 9.544361896918447, "learning_rate": 5.410935002106152e-07, "logits/chosen": -0.7660020589828491, "logits/rejected": -0.7475563883781433, "logps/chosen": -402.910400390625, "logps/rejected": -406.4446105957031, "loss": 0.6237, "rewards/accuracies": 0.65625, "rewards/chosen": -1.283268690109253, "rewards/margins": 0.24972115457057953, "rewards/rejected": -1.5329898595809937, "step": 246 }, { "epoch": 0.5279893119572479, "grad_norm": 9.1575605917451, "learning_rate": 5.373650467932121e-07, "logits/chosen": -0.741169273853302, "logits/rejected": -0.7101236581802368, "logps/chosen": -353.3587951660156, "logps/rejected": -394.3509521484375, "loss": 0.5927, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2977474927902222, "rewards/margins": 0.445009708404541, "rewards/rejected": -1.7427570819854736, "step": 247 }, { "epoch": 0.5301269205076821, "grad_norm": 10.605879178884328, "learning_rate": 5.336345028060199e-07, "logits/chosen": -0.735455334186554, "logits/rejected": -0.7146904468536377, "logps/chosen": -415.8868103027344, "logps/rejected": -471.6510925292969, "loss": 0.6285, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4680719375610352, "rewards/margins": 0.37900200486183167, "rewards/rejected": -1.8470739126205444, "step": 248 }, { "epoch": 0.5322645290581163, "grad_norm": 9.468418276322426, "learning_rate": 5.299020769725171e-07, "logits/chosen": -0.6703728437423706, "logits/rejected": -0.6554571986198425, "logps/chosen": -414.7881164550781, "logps/rejected": -444.9212646484375, "loss": 0.6305, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5948419570922852, "rewards/margins": 0.24465849995613098, "rewards/rejected": -1.8395004272460938, "step": 249 }, { "epoch": 0.5344021376085505, "grad_norm": 10.105402066971774, "learning_rate": 5.26167978121472e-07, "logits/chosen": -0.6142255663871765, "logits/rejected": -0.5848169922828674, "logps/chosen": -390.851806640625, "logps/rejected": -430.1876220703125, "loss": 0.6119, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5544801950454712, "rewards/margins": 0.30869632959365845, "rewards/rejected": -1.8631765842437744, "step": 250 }, { "epoch": 0.5365397461589846, "grad_norm": 9.810217626805535, "learning_rate": 5.224324151752575e-07, "logits/chosen": -0.6183363795280457, "logits/rejected": -0.6150676608085632, "logps/chosen": -367.5179443359375, "logps/rejected": -416.97332763671875, "loss": 0.6083, "rewards/accuracies": 0.75, "rewards/chosen": -1.2182056903839111, "rewards/margins": 0.5702115893363953, "rewards/rejected": -1.7884173393249512, "step": 251 }, { "epoch": 0.5386773547094188, "grad_norm": 11.63997005510131, "learning_rate": 5.18695597138163e-07, "logits/chosen": -0.7786095142364502, "logits/rejected": -0.7649445533752441, "logps/chosen": -406.415771484375, "logps/rejected": -438.0752868652344, "loss": 0.6444, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4087908267974854, "rewards/margins": 0.3740866780281067, "rewards/rejected": -1.7828774452209473, "step": 252 }, { "epoch": 0.540814963259853, "grad_norm": 8.971335597918381, "learning_rate": 5.149577330846992e-07, "logits/chosen": -0.722287118434906, "logits/rejected": -0.7298377752304077, "logps/chosen": -385.11602783203125, "logps/rejected": -462.1266784667969, "loss": 0.5901, "rewards/accuracies": 0.625, "rewards/chosen": -1.3030569553375244, "rewards/margins": 0.4734255373477936, "rewards/rejected": -1.7764827013015747, "step": 253 }, { "epoch": 0.5429525718102872, "grad_norm": 10.226248494849832, "learning_rate": 5.112190321479025e-07, "logits/chosen": -0.7946709990501404, "logits/rejected": -0.7953794598579407, "logps/chosen": -365.50604248046875, "logps/rejected": -393.0849914550781, "loss": 0.6099, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3220914602279663, "rewards/margins": 0.16996119916439056, "rewards/rejected": -1.492052674293518, "step": 254 }, { "epoch": 0.5450901803607214, "grad_norm": 9.70395445393845, "learning_rate": 5.074797035076318e-07, "logits/chosen": -0.8279726505279541, "logits/rejected": -0.8029213547706604, "logps/chosen": -363.36956787109375, "logps/rejected": -353.39300537109375, "loss": 0.5921, "rewards/accuracies": 0.46875, "rewards/chosen": -1.4433786869049072, "rewards/margins": 0.08160518109798431, "rewards/rejected": -1.5249840021133423, "step": 255 }, { "epoch": 0.5472277889111556, "grad_norm": 10.09145149251664, "learning_rate": 5.037399563788664e-07, "logits/chosen": -0.6333373785018921, "logits/rejected": -0.6277045011520386, "logps/chosen": -363.3057861328125, "logps/rejected": -414.02874755859375, "loss": 0.5775, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2467057704925537, "rewards/margins": 0.4376518726348877, "rewards/rejected": -1.6843575239181519, "step": 256 }, { "epoch": 0.5493653974615899, "grad_norm": 10.227202697175395, "learning_rate": 5e-07, "logits/chosen": -0.7193889021873474, "logits/rejected": -0.7205474376678467, "logps/chosen": -384.7895812988281, "logps/rejected": -409.3086242675781, "loss": 0.6474, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4086096286773682, "rewards/margins": 0.14859981834888458, "rewards/rejected": -1.5572093725204468, "step": 257 }, { "epoch": 0.5515030060120241, "grad_norm": 10.455321411037115, "learning_rate": 4.962600436211335e-07, "logits/chosen": -0.7665015459060669, "logits/rejected": -0.751805305480957, "logps/chosen": -353.6752624511719, "logps/rejected": -387.00494384765625, "loss": 0.6357, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2000305652618408, "rewards/margins": 0.3206770420074463, "rewards/rejected": -1.5207074880599976, "step": 258 }, { "epoch": 0.5536406145624583, "grad_norm": 10.67800131716867, "learning_rate": 4.925202964923683e-07, "logits/chosen": -0.67658931016922, "logits/rejected": -0.6737143397331238, "logps/chosen": -357.2424011230469, "logps/rejected": -380.38238525390625, "loss": 0.5989, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1178100109100342, "rewards/margins": 0.21349495649337769, "rewards/rejected": -1.3313050270080566, "step": 259 }, { "epoch": 0.5557782231128925, "grad_norm": 10.755754910243839, "learning_rate": 4.887809678520975e-07, "logits/chosen": -0.7121912240982056, "logits/rejected": -0.6941719055175781, "logps/chosen": -311.152587890625, "logps/rejected": -341.55682373046875, "loss": 0.579, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0979379415512085, "rewards/margins": 0.32470834255218506, "rewards/rejected": -1.4226462841033936, "step": 260 }, { "epoch": 0.5579158316633267, "grad_norm": 10.029694987313983, "learning_rate": 4.850422669153009e-07, "logits/chosen": -0.7704156041145325, "logits/rejected": -0.7731869220733643, "logps/chosen": -433.1751403808594, "logps/rejected": -475.2624206542969, "loss": 0.6159, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5628148317337036, "rewards/margins": 0.27969640493392944, "rewards/rejected": -1.842511534690857, "step": 261 }, { "epoch": 0.5600534402137608, "grad_norm": 9.985077304371496, "learning_rate": 4.813044028618372e-07, "logits/chosen": -0.655546247959137, "logits/rejected": -0.5991637110710144, "logps/chosen": -311.8508605957031, "logps/rejected": -352.53912353515625, "loss": 0.6149, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1894161701202393, "rewards/margins": 0.4629126489162445, "rewards/rejected": -1.6523289680480957, "step": 262 }, { "epoch": 0.562191048764195, "grad_norm": 11.42063256185086, "learning_rate": 4.775675848247427e-07, "logits/chosen": -0.7124533653259277, "logits/rejected": -0.7007814645767212, "logps/chosen": -349.7750549316406, "logps/rejected": -395.9293518066406, "loss": 0.606, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2669329643249512, "rewards/margins": 0.35066768527030945, "rewards/rejected": -1.617600679397583, "step": 263 }, { "epoch": 0.5643286573146292, "grad_norm": 10.025460802753425, "learning_rate": 4.7383202187852804e-07, "logits/chosen": -0.6652883887290955, "logits/rejected": -0.6626304388046265, "logps/chosen": -350.379150390625, "logps/rejected": -388.03955078125, "loss": 0.5997, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4017623662948608, "rewards/margins": 0.309231698513031, "rewards/rejected": -1.710994005203247, "step": 264 }, { "epoch": 0.5664662658650634, "grad_norm": 10.653930828086093, "learning_rate": 4.700979230274829e-07, "logits/chosen": -0.7248793244361877, "logits/rejected": -0.750960648059845, "logps/chosen": -394.911865234375, "logps/rejected": -443.53643798828125, "loss": 0.6179, "rewards/accuracies": 0.65625, "rewards/chosen": -1.381069540977478, "rewards/margins": 0.36748361587524414, "rewards/rejected": -1.7485532760620117, "step": 265 }, { "epoch": 0.5686038744154976, "grad_norm": 9.87511145308802, "learning_rate": 4.6636549719398016e-07, "logits/chosen": -0.7590113878250122, "logits/rejected": -0.7530328035354614, "logps/chosen": -422.1754150390625, "logps/rejected": -463.882080078125, "loss": 0.5906, "rewards/accuracies": 0.75, "rewards/chosen": -1.4037176370620728, "rewards/margins": 0.3301146626472473, "rewards/rejected": -1.7338322401046753, "step": 266 }, { "epoch": 0.5707414829659319, "grad_norm": 10.40705399730886, "learning_rate": 4.626349532067879e-07, "logits/chosen": -0.5113621950149536, "logits/rejected": -0.4636048972606659, "logps/chosen": -402.9996337890625, "logps/rejected": -432.8628845214844, "loss": 0.6439, "rewards/accuracies": 0.75, "rewards/chosen": -1.4809210300445557, "rewards/margins": 0.3719174861907959, "rewards/rejected": -1.8528385162353516, "step": 267 }, { "epoch": 0.5728790915163661, "grad_norm": 9.883415102033252, "learning_rate": 4.5890649978938487e-07, "logits/chosen": -0.7086624503135681, "logits/rejected": -0.6735981702804565, "logps/chosen": -396.3412170410156, "logps/rejected": -393.7690734863281, "loss": 0.5721, "rewards/accuracies": 0.625, "rewards/chosen": -1.459097146987915, "rewards/margins": 0.10202471911907196, "rewards/rejected": -1.561121940612793, "step": 268 }, { "epoch": 0.5750167000668003, "grad_norm": 11.674476580668774, "learning_rate": 4.5518034554828327e-07, "logits/chosen": -0.7449507117271423, "logits/rejected": -0.722856879234314, "logps/chosen": -426.8054504394531, "logps/rejected": -444.621337890625, "loss": 0.6042, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3702492713928223, "rewards/margins": 0.2847437858581543, "rewards/rejected": -1.6549930572509766, "step": 269 }, { "epoch": 0.5771543086172345, "grad_norm": 10.79085847307073, "learning_rate": 4.514566989613559e-07, "logits/chosen": -0.7816205024719238, "logits/rejected": -0.7831264734268188, "logps/chosen": -380.7841796875, "logps/rejected": -405.67108154296875, "loss": 0.6769, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4601352214813232, "rewards/margins": 0.22311216592788696, "rewards/rejected": -1.6832473278045654, "step": 270 }, { "epoch": 0.5792919171676687, "grad_norm": 13.754123888989874, "learning_rate": 4.477357683661733e-07, "logits/chosen": -0.6621173620223999, "logits/rejected": -0.6234359741210938, "logps/chosen": -376.8826599121094, "logps/rejected": -421.74981689453125, "loss": 0.6674, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3359074592590332, "rewards/margins": 0.37606099247932434, "rewards/rejected": -1.7119684219360352, "step": 271 }, { "epoch": 0.5814295257181029, "grad_norm": 10.684496603394274, "learning_rate": 4.4401776194834603e-07, "logits/chosen": -0.7525122761726379, "logits/rejected": -0.6963589787483215, "logps/chosen": -329.6082458496094, "logps/rejected": -376.0239562988281, "loss": 0.6287, "rewards/accuracies": 0.75, "rewards/chosen": -1.1713266372680664, "rewards/margins": 0.28129494190216064, "rewards/rejected": -1.452621579170227, "step": 272 }, { "epoch": 0.5835671342685371, "grad_norm": 9.352339379386558, "learning_rate": 4.403028877298779e-07, "logits/chosen": -0.6548051238059998, "logits/rejected": -0.632011890411377, "logps/chosen": -384.2966003417969, "logps/rejected": -421.78839111328125, "loss": 0.622, "rewards/accuracies": 0.59375, "rewards/chosen": -1.369025707244873, "rewards/margins": 0.33843424916267395, "rewards/rejected": -1.707459807395935, "step": 273 }, { "epoch": 0.5857047428189712, "grad_norm": 9.078233343454654, "learning_rate": 4.3659135355752593e-07, "logits/chosen": -0.6783146858215332, "logits/rejected": -0.6960130929946899, "logps/chosen": -353.8924560546875, "logps/rejected": -399.84918212890625, "loss": 0.5956, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3400187492370605, "rewards/margins": 0.32464563846588135, "rewards/rejected": -1.664664387702942, "step": 274 }, { "epoch": 0.5878423513694054, "grad_norm": 15.480458345502447, "learning_rate": 4.328833670911724e-07, "logits/chosen": -0.651485025882721, "logits/rejected": -0.6426280736923218, "logps/chosen": -407.6331481933594, "logps/rejected": -407.97271728515625, "loss": 0.6394, "rewards/accuracies": 0.65625, "rewards/chosen": -1.497727394104004, "rewards/margins": 0.0166710764169693, "rewards/rejected": -1.514398455619812, "step": 275 }, { "epoch": 0.5899799599198396, "grad_norm": 9.317803722726895, "learning_rate": 4.2917913579220553e-07, "logits/chosen": -0.7354484796524048, "logits/rejected": -0.7279876470565796, "logps/chosen": -336.7724914550781, "logps/rejected": -337.7842712402344, "loss": 0.6297, "rewards/accuracies": 0.625, "rewards/chosen": -1.3154979944229126, "rewards/margins": 0.18782049417495728, "rewards/rejected": -1.5033185482025146, "step": 276 }, { "epoch": 0.5921175684702739, "grad_norm": 10.981393496040226, "learning_rate": 4.254788669119127e-07, "logits/chosen": -0.6517477631568909, "logits/rejected": -0.6439751386642456, "logps/chosen": -398.1854553222656, "logps/rejected": -394.73992919921875, "loss": 0.6151, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5254887342453003, "rewards/margins": 0.09052658081054688, "rewards/rejected": -1.6160151958465576, "step": 277 }, { "epoch": 0.5942551770207081, "grad_norm": 9.363953745062531, "learning_rate": 4.2178276747988444e-07, "logits/chosen": -0.7151267528533936, "logits/rejected": -0.6989988088607788, "logps/chosen": -399.211669921875, "logps/rejected": -472.8134765625, "loss": 0.6081, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5025238990783691, "rewards/margins": 0.6458090543746948, "rewards/rejected": -2.1483330726623535, "step": 278 }, { "epoch": 0.5963927855711423, "grad_norm": 10.996768453375289, "learning_rate": 4.180910442924311e-07, "logits/chosen": -0.6743846535682678, "logits/rejected": -0.6869890093803406, "logps/chosen": -349.3891296386719, "logps/rejected": -385.591064453125, "loss": 0.6559, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1990691423416138, "rewards/margins": 0.24665698409080505, "rewards/rejected": -1.4457261562347412, "step": 279 }, { "epoch": 0.5985303941215765, "grad_norm": 12.076546659241137, "learning_rate": 4.144039039010124e-07, "logits/chosen": -0.7634164094924927, "logits/rejected": -0.7913932204246521, "logps/chosen": -363.1887512207031, "logps/rejected": -416.5007019042969, "loss": 0.599, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0668609142303467, "rewards/margins": 0.44077447056770325, "rewards/rejected": -1.507635474205017, "step": 280 }, { "epoch": 0.6006680026720107, "grad_norm": 11.14137973799164, "learning_rate": 4.107215526006817e-07, "logits/chosen": -0.7002226114273071, "logits/rejected": -0.7134915590286255, "logps/chosen": -370.8570556640625, "logps/rejected": -408.5722961425781, "loss": 0.664, "rewards/accuracies": 0.625, "rewards/chosen": -1.3522167205810547, "rewards/margins": 0.2257714569568634, "rewards/rejected": -1.5779881477355957, "step": 281 }, { "epoch": 0.6028056112224449, "grad_norm": 10.43225088057876, "learning_rate": 4.070441964185427e-07, "logits/chosen": -0.6937713623046875, "logits/rejected": -0.6445334553718567, "logps/chosen": -320.21636962890625, "logps/rejected": -386.27337646484375, "loss": 0.6365, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0563011169433594, "rewards/margins": 0.5464246869087219, "rewards/rejected": -1.6027257442474365, "step": 282 }, { "epoch": 0.6049432197728791, "grad_norm": 9.579570581028333, "learning_rate": 4.0337204110222347e-07, "logits/chosen": -0.7348592281341553, "logits/rejected": -0.7190099954605103, "logps/chosen": -368.09918212890625, "logps/rejected": -410.2267761230469, "loss": 0.6029, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1717023849487305, "rewards/margins": 0.4082415699958801, "rewards/rejected": -1.5799440145492554, "step": 283 }, { "epoch": 0.6070808283233133, "grad_norm": 10.220765953116597, "learning_rate": 3.997052921083636e-07, "logits/chosen": -0.6168830394744873, "logits/rejected": -0.6260079145431519, "logps/chosen": -374.4775695800781, "logps/rejected": -405.28546142578125, "loss": 0.6056, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2549736499786377, "rewards/margins": 0.39645275473594666, "rewards/rejected": -1.6514263153076172, "step": 284 }, { "epoch": 0.6092184368737475, "grad_norm": 11.754142515548534, "learning_rate": 3.960441545911204e-07, "logits/chosen": -0.7724018096923828, "logits/rejected": -0.8003143668174744, "logps/chosen": -411.28765869140625, "logps/rejected": -446.0718688964844, "loss": 0.6254, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3750414848327637, "rewards/margins": 0.19460612535476685, "rewards/rejected": -1.5696475505828857, "step": 285 }, { "epoch": 0.6113560454241816, "grad_norm": 11.71195693335506, "learning_rate": 3.92388833390689e-07, "logits/chosen": -0.6072220206260681, "logits/rejected": -0.5882732272148132, "logps/chosen": -362.8934020996094, "logps/rejected": -384.33685302734375, "loss": 0.6421, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4378963708877563, "rewards/margins": 0.3072332739830017, "rewards/rejected": -1.7451298236846924, "step": 286 }, { "epoch": 0.6134936539746159, "grad_norm": 10.423314767059496, "learning_rate": 3.8873953302184283e-07, "logits/chosen": -0.6478594541549683, "logits/rejected": -0.6148996949195862, "logps/chosen": -402.24993896484375, "logps/rejected": -419.41510009765625, "loss": 0.6184, "rewards/accuracies": 0.65625, "rewards/chosen": -1.573075532913208, "rewards/margins": 0.27030453085899353, "rewards/rejected": -1.8433799743652344, "step": 287 }, { "epoch": 0.6156312625250501, "grad_norm": 11.589829757981947, "learning_rate": 3.8509645766249034e-07, "logits/chosen": -0.7512708902359009, "logits/rejected": -0.7593178749084473, "logps/chosen": -430.9858093261719, "logps/rejected": -473.86578369140625, "loss": 0.6066, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5662776231765747, "rewards/margins": 0.3962094187736511, "rewards/rejected": -1.9624871015548706, "step": 288 }, { "epoch": 0.6177688710754843, "grad_norm": 38.79630588602357, "learning_rate": 3.814598111422513e-07, "logits/chosen": -0.7107813358306885, "logits/rejected": -0.7043961882591248, "logps/chosen": -359.62713623046875, "logps/rejected": -373.19805908203125, "loss": 0.6213, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3326873779296875, "rewards/margins": 0.10174018889665604, "rewards/rejected": -1.4344274997711182, "step": 289 }, { "epoch": 0.6199064796259185, "grad_norm": 9.198178073702046, "learning_rate": 3.778297969310529e-07, "logits/chosen": -0.7122032046318054, "logits/rejected": -0.7226367592811584, "logps/chosen": -360.8456726074219, "logps/rejected": -395.7742614746094, "loss": 0.6057, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3359217643737793, "rewards/margins": 0.3005616068840027, "rewards/rejected": -1.6364833116531372, "step": 290 }, { "epoch": 0.6220440881763527, "grad_norm": 11.585726416356431, "learning_rate": 3.742066181277457e-07, "logits/chosen": -0.6904798150062561, "logits/rejected": -0.6984922885894775, "logps/chosen": -385.777587890625, "logps/rejected": -417.1224670410156, "loss": 0.602, "rewards/accuracies": 0.75, "rewards/chosen": -1.4644272327423096, "rewards/margins": 0.26233839988708496, "rewards/rejected": -1.7267656326293945, "step": 291 }, { "epoch": 0.6241816967267869, "grad_norm": 10.98477533754328, "learning_rate": 3.7059047744873955e-07, "logits/chosen": -0.6717097759246826, "logits/rejected": -0.6137974262237549, "logps/chosen": -388.8412780761719, "logps/rejected": -408.81964111328125, "loss": 0.6424, "rewards/accuracies": 0.625, "rewards/chosen": -1.4904993772506714, "rewards/margins": 0.2106323540210724, "rewards/rejected": -1.701131820678711, "step": 292 }, { "epoch": 0.6263193052772211, "grad_norm": 11.098357379348672, "learning_rate": 3.669815772166625e-07, "logits/chosen": -0.7643608450889587, "logits/rejected": -0.7616855502128601, "logps/chosen": -399.7235412597656, "logps/rejected": -444.3169250488281, "loss": 0.5882, "rewards/accuracies": 0.75, "rewards/chosen": -1.1846121549606323, "rewards/margins": 0.3367740213871002, "rewards/rejected": -1.5213862657546997, "step": 293 }, { "epoch": 0.6284569138276553, "grad_norm": 9.823709075894158, "learning_rate": 3.6338011934904e-07, "logits/chosen": -0.7340261936187744, "logits/rejected": -0.7253273129463196, "logps/chosen": -415.0310974121094, "logps/rejected": -479.2626953125, "loss": 0.5839, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4078618288040161, "rewards/margins": 0.7003488540649414, "rewards/rejected": -2.108210802078247, "step": 294 }, { "epoch": 0.6305945223780896, "grad_norm": 10.87697682105901, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -0.6595284342765808, "logits/rejected": -0.6863126754760742, "logps/chosen": -365.39013671875, "logps/rejected": -408.86541748046875, "loss": 0.652, "rewards/accuracies": 0.71875, "rewards/chosen": -1.497340202331543, "rewards/margins": 0.30657070875167847, "rewards/rejected": -1.803910732269287, "step": 295 }, { "epoch": 0.6327321309285238, "grad_norm": 11.045318981796374, "learning_rate": 3.562003362839914e-07, "logits/chosen": -0.7206366062164307, "logits/rejected": -0.7295577526092529, "logps/chosen": -461.62225341796875, "logps/rejected": -458.39703369140625, "loss": 0.6665, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7048362493515015, "rewards/margins": 0.04338730126619339, "rewards/rejected": -1.7482235431671143, "step": 296 }, { "epoch": 0.6348697394789579, "grad_norm": 10.855323994177997, "learning_rate": 3.526224127945478e-07, "logits/chosen": -0.6919922828674316, "logits/rejected": -0.6954550743103027, "logps/chosen": -336.01556396484375, "logps/rejected": -376.9835205078125, "loss": 0.6407, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3392897844314575, "rewards/margins": 0.2275466024875641, "rewards/rejected": -1.5668363571166992, "step": 297 }, { "epoch": 0.6370073480293921, "grad_norm": 9.58788965705626, "learning_rate": 3.49052735063049e-07, "logits/chosen": -0.8874866962432861, "logits/rejected": -0.8917239904403687, "logps/chosen": -403.54693603515625, "logps/rejected": -442.32305908203125, "loss": 0.5722, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3754301071166992, "rewards/margins": 0.39617669582366943, "rewards/rejected": -1.771606683731079, "step": 298 }, { "epoch": 0.6391449565798263, "grad_norm": 10.56963230174736, "learning_rate": 3.454915028125263e-07, "logits/chosen": -0.6784321665763855, "logits/rejected": -0.6550740003585815, "logps/chosen": -406.8092041015625, "logps/rejected": -409.7593688964844, "loss": 0.6048, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4954036474227905, "rewards/margins": 0.18573154509067535, "rewards/rejected": -1.6811351776123047, "step": 299 }, { "epoch": 0.6412825651302605, "grad_norm": 10.005595422402594, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -0.7489890456199646, "logits/rejected": -0.7623311281204224, "logps/chosen": -440.9601135253906, "logps/rejected": -441.59466552734375, "loss": 0.6376, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6095386743545532, "rewards/margins": 0.23035281896591187, "rewards/rejected": -1.8398916721343994, "step": 300 }, { "epoch": 0.6412825651302605, "eval_logits/chosen": -0.6758147478103638, "eval_logits/rejected": -0.6752761006355286, "eval_logps/chosen": -390.88177490234375, "eval_logps/rejected": -425.3858947753906, "eval_loss": 0.6177628040313721, "eval_rewards/accuracies": 0.6747967600822449, "eval_rewards/chosen": -1.3533374071121216, "eval_rewards/margins": 0.28798195719718933, "eval_rewards/rejected": -1.6413193941116333, "eval_runtime": 377.299, "eval_samples_per_second": 5.197, "eval_steps_per_second": 0.326, "step": 300 }, { "epoch": 0.6434201736806947, "grad_norm": 10.329473531690297, "learning_rate": 3.3839517127277004e-07, "logits/chosen": -0.7601391673088074, "logits/rejected": -0.7844873070716858, "logps/chosen": -393.47540283203125, "logps/rejected": -435.7286071777344, "loss": 0.6387, "rewards/accuracies": 0.6875, "rewards/chosen": -1.259373426437378, "rewards/margins": 0.349065899848938, "rewards/rejected": -1.608439326286316, "step": 301 }, { "epoch": 0.6455577822311289, "grad_norm": 8.767557926350584, "learning_rate": 3.348604690224166e-07, "logits/chosen": -0.8301680088043213, "logits/rejected": -0.8203250169754028, "logps/chosen": -425.7115478515625, "logps/rejected": -468.8160095214844, "loss": 0.6113, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3414942026138306, "rewards/margins": 0.44216763973236084, "rewards/rejected": -1.7836618423461914, "step": 302 }, { "epoch": 0.6476953907815631, "grad_norm": 11.252600228651138, "learning_rate": 3.31335006308585e-07, "logits/chosen": -0.7533825635910034, "logits/rejected": -0.732757031917572, "logps/chosen": -393.85040283203125, "logps/rejected": -415.14080810546875, "loss": 0.6301, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3789258003234863, "rewards/margins": 0.23148328065872192, "rewards/rejected": -1.6104090213775635, "step": 303 }, { "epoch": 0.6498329993319973, "grad_norm": 10.257975436900558, "learning_rate": 3.2781898038048237e-07, "logits/chosen": -0.6510428786277771, "logits/rejected": -0.6685248613357544, "logps/chosen": -390.3652038574219, "logps/rejected": -393.8785400390625, "loss": 0.6683, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5787415504455566, "rewards/margins": 0.11641066521406174, "rewards/rejected": -1.6951522827148438, "step": 304 }, { "epoch": 0.6519706078824316, "grad_norm": 9.860462354210497, "learning_rate": 3.243125879593286e-07, "logits/chosen": -0.7366013526916504, "logits/rejected": -0.7264673709869385, "logps/chosen": -361.6595458984375, "logps/rejected": -408.7252197265625, "loss": 0.6058, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1662776470184326, "rewards/margins": 0.39646124839782715, "rewards/rejected": -1.5627388954162598, "step": 305 }, { "epoch": 0.6541082164328658, "grad_norm": 11.932221586274338, "learning_rate": 3.2081602522734985e-07, "logits/chosen": -0.7773129343986511, "logits/rejected": -0.7762659788131714, "logps/chosen": -384.003662109375, "logps/rejected": -423.2783203125, "loss": 0.5892, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3898005485534668, "rewards/margins": 0.3622281551361084, "rewards/rejected": -1.7520288228988647, "step": 306 }, { "epoch": 0.6562458249833, "grad_norm": 9.369038313539917, "learning_rate": 3.173294878168025e-07, "logits/chosen": -0.6643047332763672, "logits/rejected": -0.6601549386978149, "logps/chosen": -372.4691162109375, "logps/rejected": -406.27996826171875, "loss": 0.6158, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4243800640106201, "rewards/margins": 0.29933756589889526, "rewards/rejected": -1.7237175703048706, "step": 307 }, { "epoch": 0.6583834335337342, "grad_norm": 10.15702054511366, "learning_rate": 3.138531707990274e-07, "logits/chosen": -0.6945326328277588, "logits/rejected": -0.6813417673110962, "logps/chosen": -367.9193115234375, "logps/rejected": -422.1386413574219, "loss": 0.5835, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1640138626098633, "rewards/margins": 0.5344864726066589, "rewards/rejected": -1.6985002756118774, "step": 308 }, { "epoch": 0.6605210420841683, "grad_norm": 10.774899794292791, "learning_rate": 3.1038726867353583e-07, "logits/chosen": -0.678726315498352, "logits/rejected": -0.706427276134491, "logps/chosen": -402.2789001464844, "logps/rejected": -475.16436767578125, "loss": 0.5877, "rewards/accuracies": 0.78125, "rewards/chosen": -1.296879529953003, "rewards/margins": 0.6074644327163696, "rewards/rejected": -1.904344081878662, "step": 309 }, { "epoch": 0.6626586506346025, "grad_norm": 9.326333800621224, "learning_rate": 3.069319753571269e-07, "logits/chosen": -0.7166895866394043, "logits/rejected": -0.7176540493965149, "logps/chosen": -386.4005432128906, "logps/rejected": -395.38970947265625, "loss": 0.6111, "rewards/accuracies": 0.5, "rewards/chosen": -1.5514951944351196, "rewards/margins": 0.11902564764022827, "rewards/rejected": -1.6705207824707031, "step": 310 }, { "epoch": 0.6647962591850367, "grad_norm": 11.49416505541279, "learning_rate": 3.034874841730382e-07, "logits/chosen": -0.7580830454826355, "logits/rejected": -0.7336598634719849, "logps/chosen": -402.9891052246094, "logps/rejected": -430.2671813964844, "loss": 0.6368, "rewards/accuracies": 0.625, "rewards/chosen": -1.3294634819030762, "rewards/margins": 0.3222670555114746, "rewards/rejected": -1.6517305374145508, "step": 311 }, { "epoch": 0.6669338677354709, "grad_norm": 10.887373926899288, "learning_rate": 3.000539878401296e-07, "logits/chosen": -0.6197298765182495, "logits/rejected": -0.5989848375320435, "logps/chosen": -391.74951171875, "logps/rejected": -449.1798400878906, "loss": 0.6082, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4147597551345825, "rewards/margins": 0.5554874539375305, "rewards/rejected": -1.9702472686767578, "step": 312 }, { "epoch": 0.6690714762859051, "grad_norm": 11.171229600938071, "learning_rate": 2.9663167846209996e-07, "logits/chosen": -0.6838382482528687, "logits/rejected": -0.6743027567863464, "logps/chosen": -368.8251037597656, "logps/rejected": -415.6241149902344, "loss": 0.6372, "rewards/accuracies": 0.71875, "rewards/chosen": -1.388074278831482, "rewards/margins": 0.4560723900794983, "rewards/rejected": -1.844146490097046, "step": 313 }, { "epoch": 0.6712090848363393, "grad_norm": 10.489570114197578, "learning_rate": 2.9322074751673974e-07, "logits/chosen": -0.6488001346588135, "logits/rejected": -0.6053016781806946, "logps/chosen": -422.6211853027344, "logps/rejected": -449.6383972167969, "loss": 0.6577, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6949717998504639, "rewards/margins": 0.3523138463497162, "rewards/rejected": -2.047285556793213, "step": 314 }, { "epoch": 0.6733466933867736, "grad_norm": 9.335833798624803, "learning_rate": 2.898213858452173e-07, "logits/chosen": -0.7407481670379639, "logits/rejected": -0.6984574794769287, "logps/chosen": -426.7627868652344, "logps/rejected": -433.4222106933594, "loss": 0.6038, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5585707426071167, "rewards/margins": 0.30426639318466187, "rewards/rejected": -1.8628369569778442, "step": 315 }, { "epoch": 0.6754843019372078, "grad_norm": 9.82583152531341, "learning_rate": 2.864337836414018e-07, "logits/chosen": -0.7897535562515259, "logits/rejected": -0.7509832382202148, "logps/chosen": -440.0413818359375, "logps/rejected": -473.6156311035156, "loss": 0.5877, "rewards/accuracies": 0.78125, "rewards/chosen": -1.778262972831726, "rewards/margins": 0.3270663917064667, "rewards/rejected": -2.1053295135498047, "step": 316 }, { "epoch": 0.677621910487642, "grad_norm": 10.397708781784715, "learning_rate": 2.8305813044122093e-07, "logits/chosen": -0.5974478125572205, "logits/rejected": -0.5807868242263794, "logps/chosen": -366.0530090332031, "logps/rejected": -355.85882568359375, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": -1.3970279693603516, "rewards/margins": 0.0002168789505958557, "rewards/rejected": -1.3972446918487549, "step": 317 }, { "epoch": 0.6797595190380762, "grad_norm": 10.464645314526035, "learning_rate": 2.7969461511205806e-07, "logits/chosen": -0.626457691192627, "logits/rejected": -0.5530537366867065, "logps/chosen": -330.521240234375, "logps/rejected": -358.5421142578125, "loss": 0.6146, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4030221700668335, "rewards/margins": 0.22237975895404816, "rewards/rejected": -1.6254019737243652, "step": 318 }, { "epoch": 0.6818971275885104, "grad_norm": 9.874403173091292, "learning_rate": 2.763434258421836e-07, "logits/chosen": -0.7100091576576233, "logits/rejected": -0.6709161996841431, "logps/chosen": -342.3360595703125, "logps/rejected": -356.4312744140625, "loss": 0.6294, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2349555492401123, "rewards/margins": 0.18816961348056793, "rewards/rejected": -1.4231250286102295, "step": 319 }, { "epoch": 0.6840347361389446, "grad_norm": 10.416140198085172, "learning_rate": 2.730047501302266e-07, "logits/chosen": -0.7924367785453796, "logits/rejected": -0.7890709638595581, "logps/chosen": -402.750244140625, "logps/rejected": -433.9951171875, "loss": 0.5975, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3218122720718384, "rewards/margins": 0.32942885160446167, "rewards/rejected": -1.6512411832809448, "step": 320 }, { "epoch": 0.6861723446893787, "grad_norm": 10.264424479929405, "learning_rate": 2.696787747746839e-07, "logits/chosen": -0.7326480150222778, "logits/rejected": -0.727679967880249, "logps/chosen": -335.9344177246094, "logps/rejected": -376.9026794433594, "loss": 0.6166, "rewards/accuracies": 0.625, "rewards/chosen": -1.2939304113388062, "rewards/margins": 0.3992197811603546, "rewards/rejected": -1.6931501626968384, "step": 321 }, { "epoch": 0.6883099532398129, "grad_norm": 9.672418793392822, "learning_rate": 2.6636568586346897e-07, "logits/chosen": -0.7330962419509888, "logits/rejected": -0.7231791615486145, "logps/chosen": -344.6290588378906, "logps/rejected": -368.3528137207031, "loss": 0.6236, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1829473972320557, "rewards/margins": 0.23583151400089264, "rewards/rejected": -1.4187790155410767, "step": 322 }, { "epoch": 0.6904475617902471, "grad_norm": 12.909094410970068, "learning_rate": 2.6306566876350067e-07, "logits/chosen": -0.7223283648490906, "logits/rejected": -0.6862327456474304, "logps/chosen": -427.14727783203125, "logps/rejected": -453.6234436035156, "loss": 0.5843, "rewards/accuracies": 0.625, "rewards/chosen": -1.5469043254852295, "rewards/margins": 0.278840035200119, "rewards/rejected": -1.825744390487671, "step": 323 }, { "epoch": 0.6925851703406813, "grad_norm": 13.707807531422917, "learning_rate": 2.597789081103313e-07, "logits/chosen": -0.7629610300064087, "logits/rejected": -0.727975070476532, "logps/chosen": -382.91278076171875, "logps/rejected": -421.9703369140625, "loss": 0.5563, "rewards/accuracies": 0.875, "rewards/chosen": -1.4632904529571533, "rewards/margins": 0.4552845358848572, "rewards/rejected": -1.9185751676559448, "step": 324 }, { "epoch": 0.6947227788911156, "grad_norm": 10.588462296925226, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -0.621019184589386, "logits/rejected": -0.5743827223777771, "logps/chosen": -433.55267333984375, "logps/rejected": -461.3088073730469, "loss": 0.6159, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6322648525238037, "rewards/margins": 0.2886176109313965, "rewards/rejected": -1.9208825826644897, "step": 325 }, { "epoch": 0.6968603874415498, "grad_norm": 9.6025165386732, "learning_rate": 2.5324589096782656e-07, "logits/chosen": -0.6759508848190308, "logits/rejected": -0.6631283760070801, "logps/chosen": -414.1610107421875, "logps/rejected": -420.1568603515625, "loss": 0.6298, "rewards/accuracies": 0.59375, "rewards/chosen": -1.416655421257019, "rewards/margins": 0.17679718136787415, "rewards/rejected": -1.5934526920318604, "step": 326 }, { "epoch": 0.698997995991984, "grad_norm": 12.80932336769188, "learning_rate": 2.500000000000001e-07, "logits/chosen": -0.6499335765838623, "logits/rejected": -0.662979245185852, "logps/chosen": -405.96063232421875, "logps/rejected": -447.20172119140625, "loss": 0.6288, "rewards/accuracies": 0.625, "rewards/chosen": -1.4347225427627563, "rewards/margins": 0.31777456402778625, "rewards/rejected": -1.7524970769882202, "step": 327 }, { "epoch": 0.7011356045424182, "grad_norm": 12.08919140781653, "learning_rate": 2.467680965015387e-07, "logits/chosen": -0.7271804213523865, "logits/rejected": -0.7305589914321899, "logps/chosen": -362.54632568359375, "logps/rejected": -384.875, "loss": 0.634, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2877211570739746, "rewards/margins": 0.24846753478050232, "rewards/rejected": -1.5361886024475098, "step": 328 }, { "epoch": 0.7032732130928524, "grad_norm": 10.873874736167313, "learning_rate": 2.4355036129704696e-07, "logits/chosen": -0.6805239915847778, "logits/rejected": -0.6776773929595947, "logps/chosen": -472.7155456542969, "logps/rejected": -522.7994384765625, "loss": 0.6205, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6964683532714844, "rewards/margins": 0.3618759214878082, "rewards/rejected": -2.0583443641662598, "step": 329 }, { "epoch": 0.7054108216432866, "grad_norm": 16.081627749911508, "learning_rate": 2.403469744184154e-07, "logits/chosen": -0.7133939266204834, "logits/rejected": -0.7143837809562683, "logps/chosen": -382.49078369140625, "logps/rejected": -437.5693664550781, "loss": 0.6097, "rewards/accuracies": 0.8125, "rewards/chosen": -1.266930341720581, "rewards/margins": 0.49213629961013794, "rewards/rejected": -1.7590665817260742, "step": 330 }, { "epoch": 0.7075484301937208, "grad_norm": 10.322962661870067, "learning_rate": 2.371581150947476e-07, "logits/chosen": -0.8041883707046509, "logits/rejected": -0.8093154430389404, "logps/chosen": -430.856689453125, "logps/rejected": -477.88787841796875, "loss": 0.6063, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3982198238372803, "rewards/margins": 0.3465649485588074, "rewards/rejected": -1.7447847127914429, "step": 331 }, { "epoch": 0.7096860387441549, "grad_norm": 10.446630163251449, "learning_rate": 2.3398396174233176e-07, "logits/chosen": -0.6520624160766602, "logits/rejected": -0.6437772512435913, "logps/chosen": -422.0386047363281, "logps/rejected": -486.0862731933594, "loss": 0.6174, "rewards/accuracies": 0.75, "rewards/chosen": -1.6643366813659668, "rewards/margins": 0.3572143316268921, "rewards/rejected": -2.0215511322021484, "step": 332 }, { "epoch": 0.7118236472945891, "grad_norm": 11.809866444989042, "learning_rate": 2.3082469195465893e-07, "logits/chosen": -0.7520323395729065, "logits/rejected": -0.7196107506752014, "logps/chosen": -411.33251953125, "logps/rejected": -455.638916015625, "loss": 0.5696, "rewards/accuracies": 0.625, "rewards/chosen": -1.6863501071929932, "rewards/margins": 0.3991457223892212, "rewards/rejected": -2.085495710372925, "step": 333 }, { "epoch": 0.7139612558450233, "grad_norm": 11.4706777134489, "learning_rate": 2.2768048249248644e-07, "logits/chosen": -0.6395952105522156, "logits/rejected": -0.612390398979187, "logps/chosen": -408.4999084472656, "logps/rejected": -444.7389831542969, "loss": 0.6339, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7306082248687744, "rewards/margins": 0.3128102421760559, "rewards/rejected": -2.0434184074401855, "step": 334 }, { "epoch": 0.7160988643954576, "grad_norm": 10.253605586246742, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -0.6910028457641602, "logits/rejected": -0.6887121200561523, "logps/chosen": -373.79571533203125, "logps/rejected": -457.9923095703125, "loss": 0.6146, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5869219303131104, "rewards/margins": 0.5055859088897705, "rewards/rejected": -2.0925076007843018, "step": 335 }, { "epoch": 0.7182364729458918, "grad_norm": 10.09520238328347, "learning_rate": 2.2143794736471388e-07, "logits/chosen": -0.7225451469421387, "logits/rejected": -0.7483439445495605, "logps/chosen": -484.85748291015625, "logps/rejected": -529.5263061523438, "loss": 0.6224, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8494200706481934, "rewards/margins": 0.2027570605278015, "rewards/rejected": -2.0521771907806396, "step": 336 }, { "epoch": 0.720374081496326, "grad_norm": 10.071232143800623, "learning_rate": 2.1833997096818895e-07, "logits/chosen": -0.5754382610321045, "logits/rejected": -0.5392119288444519, "logps/chosen": -344.8245544433594, "logps/rejected": -379.4691162109375, "loss": 0.6219, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2857069969177246, "rewards/margins": 0.34328341484069824, "rewards/rejected": -1.6289904117584229, "step": 337 }, { "epoch": 0.7225116900467602, "grad_norm": 10.65729584502251, "learning_rate": 2.1525775341577402e-07, "logits/chosen": -0.6606283187866211, "logits/rejected": -0.6608355045318604, "logps/chosen": -414.2405700683594, "logps/rejected": -429.601806640625, "loss": 0.5947, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3188872337341309, "rewards/margins": 0.19126060605049133, "rewards/rejected": -1.5101479291915894, "step": 338 }, { "epoch": 0.7246492985971944, "grad_norm": 10.538063937615522, "learning_rate": 2.121914671571633e-07, "logits/chosen": -0.7743428945541382, "logits/rejected": -0.7525985836982727, "logps/chosen": -367.4284973144531, "logps/rejected": -441.0301818847656, "loss": 0.6129, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3432848453521729, "rewards/margins": 0.6236953735351562, "rewards/rejected": -1.9669800996780396, "step": 339 }, { "epoch": 0.7267869071476286, "grad_norm": 9.94512421358411, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -0.7715132236480713, "logits/rejected": -0.7799488306045532, "logps/chosen": -431.17169189453125, "logps/rejected": -494.21197509765625, "loss": 0.5912, "rewards/accuracies": 0.625, "rewards/chosen": -1.5074207782745361, "rewards/margins": 0.48499661684036255, "rewards/rejected": -1.9924174547195435, "step": 340 }, { "epoch": 0.7289245156980628, "grad_norm": 12.36515548240608, "learning_rate": 2.0610737385376348e-07, "logits/chosen": -0.7136672139167786, "logits/rejected": -0.6798695921897888, "logps/chosen": -405.33905029296875, "logps/rejected": -437.63330078125, "loss": 0.5978, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4325660467147827, "rewards/margins": 0.23246146738529205, "rewards/rejected": -1.6650276184082031, "step": 341 }, { "epoch": 0.731062124248497, "grad_norm": 10.594170284983056, "learning_rate": 2.0308990721324926e-07, "logits/chosen": -0.6517391800880432, "logits/rejected": -0.6472780108451843, "logps/chosen": -456.90570068359375, "logps/rejected": -480.61138916015625, "loss": 0.5951, "rewards/accuracies": 0.84375, "rewards/chosen": -1.691187858581543, "rewards/margins": 0.42632347345352173, "rewards/rejected": -2.117511034011841, "step": 342 }, { "epoch": 0.7331997327989312, "grad_norm": 11.483481596708172, "learning_rate": 2.0008905265604315e-07, "logits/chosen": -0.7073544263839722, "logits/rejected": -0.6990326642990112, "logps/chosen": -409.7100524902344, "logps/rejected": -450.9366455078125, "loss": 0.5441, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6152515411376953, "rewards/margins": 0.3995630741119385, "rewards/rejected": -2.014814853668213, "step": 343 }, { "epoch": 0.7353373413493653, "grad_norm": 13.346456381620781, "learning_rate": 1.971049780795901e-07, "logits/chosen": -0.7003156542778015, "logits/rejected": -0.6687884330749512, "logps/chosen": -310.5570373535156, "logps/rejected": -344.05859375, "loss": 0.6145, "rewards/accuracies": 0.75, "rewards/chosen": -1.0297322273254395, "rewards/margins": 0.3454705476760864, "rewards/rejected": -1.3752026557922363, "step": 344 }, { "epoch": 0.7374749498997996, "grad_norm": 10.139219194086207, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -0.6944881677627563, "logits/rejected": -0.6632839441299438, "logps/chosen": -381.2460632324219, "logps/rejected": -414.730712890625, "loss": 0.5746, "rewards/accuracies": 0.6875, "rewards/chosen": -1.378089427947998, "rewards/margins": 0.35800689458847046, "rewards/rejected": -1.7360961437225342, "step": 345 }, { "epoch": 0.7396125584502338, "grad_norm": 9.84020912855359, "learning_rate": 1.9118783575519109e-07, "logits/chosen": -0.7444390058517456, "logits/rejected": -0.7687693238258362, "logps/chosen": -441.13104248046875, "logps/rejected": -471.73797607421875, "loss": 0.6159, "rewards/accuracies": 0.59375, "rewards/chosen": -1.675041675567627, "rewards/margins": 0.1502073109149933, "rewards/rejected": -1.8252489566802979, "step": 346 }, { "epoch": 0.741750167000668, "grad_norm": 11.291373399374436, "learning_rate": 1.8825509907063326e-07, "logits/chosen": -0.7405213117599487, "logits/rejected": -0.7411251068115234, "logps/chosen": -346.1521301269531, "logps/rejected": -372.3758850097656, "loss": 0.6207, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4540354013442993, "rewards/margins": 0.28074249625205994, "rewards/rejected": -1.7347780466079712, "step": 347 }, { "epoch": 0.7438877755511022, "grad_norm": 9.478462210282277, "learning_rate": 1.8533980447508135e-07, "logits/chosen": -0.7745504975318909, "logits/rejected": -0.7580114603042603, "logps/chosen": -364.4132995605469, "logps/rejected": -376.5552978515625, "loss": 0.6103, "rewards/accuracies": 0.625, "rewards/chosen": -1.342382550239563, "rewards/margins": 0.21273840963840485, "rewards/rejected": -1.5551210641860962, "step": 348 }, { "epoch": 0.7460253841015364, "grad_norm": 11.376031296146607, "learning_rate": 1.824421150789106e-07, "logits/chosen": -0.588141918182373, "logits/rejected": -0.6058573126792908, "logps/chosen": -402.21026611328125, "logps/rejected": -441.880615234375, "loss": 0.6202, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4959633350372314, "rewards/margins": 0.359602689743042, "rewards/rejected": -1.8555659055709839, "step": 349 }, { "epoch": 0.7481629926519706, "grad_norm": 9.3095000239465, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -0.7804590463638306, "logits/rejected": -0.768570601940155, "logps/chosen": -395.44970703125, "logps/rejected": -442.4303894042969, "loss": 0.5622, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3593695163726807, "rewards/margins": 0.4249880313873291, "rewards/rejected": -1.7843575477600098, "step": 350 }, { "epoch": 0.7503006012024048, "grad_norm": 10.971792168406296, "learning_rate": 1.7670019939210023e-07, "logits/chosen": -0.6696098446846008, "logits/rejected": -0.6669338941574097, "logps/chosen": -451.68768310546875, "logps/rejected": -497.6464538574219, "loss": 0.5897, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7285881042480469, "rewards/margins": 0.4477997124195099, "rewards/rejected": -2.1763877868652344, "step": 351 }, { "epoch": 0.752438209752839, "grad_norm": 11.419971323161318, "learning_rate": 1.7385629436093956e-07, "logits/chosen": -0.6907357573509216, "logits/rejected": -0.637013852596283, "logps/chosen": -432.1939392089844, "logps/rejected": -469.10162353515625, "loss": 0.6008, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7298922538757324, "rewards/margins": 0.38807937502861023, "rewards/rejected": -2.117971420288086, "step": 352 }, { "epoch": 0.7545758183032732, "grad_norm": 12.388342839443734, "learning_rate": 1.710306370301437e-07, "logits/chosen": -0.7042302489280701, "logits/rejected": -0.7210839986801147, "logps/chosen": -481.449951171875, "logps/rejected": -541.4148559570312, "loss": 0.6228, "rewards/accuracies": 0.75, "rewards/chosen": -1.7111109495162964, "rewards/margins": 0.5329866409301758, "rewards/rejected": -2.2440977096557617, "step": 353 }, { "epoch": 0.7567134268537075, "grad_norm": 10.900263207759233, "learning_rate": 1.6822338549489446e-07, "logits/chosen": -0.6276527047157288, "logits/rejected": -0.6185672879219055, "logps/chosen": -353.99462890625, "logps/rejected": -390.6813659667969, "loss": 0.5823, "rewards/accuracies": 0.71875, "rewards/chosen": -1.318265438079834, "rewards/margins": 0.3258642554283142, "rewards/rejected": -1.6441295146942139, "step": 354 }, { "epoch": 0.7588510354041417, "grad_norm": 11.630075473409493, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -0.7092128992080688, "logits/rejected": -0.6994844079017639, "logps/chosen": -449.5421142578125, "logps/rejected": -491.48406982421875, "loss": 0.5833, "rewards/accuracies": 0.625, "rewards/chosen": -1.727628231048584, "rewards/margins": 0.27954497933387756, "rewards/rejected": -2.0071730613708496, "step": 355 }, { "epoch": 0.7609886439545758, "grad_norm": 11.035433473738337, "learning_rate": 1.6266472703396284e-07, "logits/chosen": -0.801999568939209, "logits/rejected": -0.7807914614677429, "logps/chosen": -436.8575439453125, "logps/rejected": -460.9193115234375, "loss": 0.5715, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5792278051376343, "rewards/margins": 0.3968886435031891, "rewards/rejected": -1.976116418838501, "step": 356 }, { "epoch": 0.76312625250501, "grad_norm": 11.722081233691071, "learning_rate": 1.599136311145402e-07, "logits/chosen": -0.6747885942459106, "logits/rejected": -0.618495523929596, "logps/chosen": -422.78729248046875, "logps/rejected": -472.6419372558594, "loss": 0.6287, "rewards/accuracies": 0.59375, "rewards/chosen": -1.606202244758606, "rewards/margins": 0.4832748472690582, "rewards/rejected": -2.089477062225342, "step": 357 }, { "epoch": 0.7652638610554442, "grad_norm": 12.64820349502746, "learning_rate": 1.5718156298578288e-07, "logits/chosen": -0.7273571491241455, "logits/rejected": -0.6881564855575562, "logps/chosen": -425.4215087890625, "logps/rejected": -444.314208984375, "loss": 0.628, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6894898414611816, "rewards/margins": 0.1857471764087677, "rewards/rejected": -1.875237226486206, "step": 358 }, { "epoch": 0.7674014696058784, "grad_norm": 10.96328639456088, "learning_rate": 1.5446867550656767e-07, "logits/chosen": -0.6399669647216797, "logits/rejected": -0.6358177661895752, "logps/chosen": -372.887939453125, "logps/rejected": -401.3673095703125, "loss": 0.5816, "rewards/accuracies": 0.6875, "rewards/chosen": -1.543872356414795, "rewards/margins": 0.2215932011604309, "rewards/rejected": -1.7654657363891602, "step": 359 }, { "epoch": 0.7695390781563126, "grad_norm": 11.515812302503546, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -0.7418455481529236, "logits/rejected": -0.6992334127426147, "logps/chosen": -440.9914245605469, "logps/rejected": -484.72882080078125, "loss": 0.5918, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6546094417572021, "rewards/margins": 0.40860795974731445, "rewards/rejected": -2.0632174015045166, "step": 360 }, { "epoch": 0.7716766867067468, "grad_norm": 11.301203400803265, "learning_rate": 1.4910104855800426e-07, "logits/chosen": -0.5830298066139221, "logits/rejected": -0.541452944278717, "logps/chosen": -428.9151611328125, "logps/rejected": -450.665283203125, "loss": 0.6244, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6960718631744385, "rewards/margins": 0.26461082696914673, "rewards/rejected": -1.96068274974823, "step": 361 }, { "epoch": 0.773814295257181, "grad_norm": 10.448871439187576, "learning_rate": 1.4644660940672627e-07, "logits/chosen": -0.643266499042511, "logits/rejected": -0.6516848802566528, "logps/chosen": -382.15582275390625, "logps/rejected": -422.58001708984375, "loss": 0.6302, "rewards/accuracies": 0.6875, "rewards/chosen": -1.693763017654419, "rewards/margins": 0.31615814566612244, "rewards/rejected": -2.0099213123321533, "step": 362 }, { "epoch": 0.7759519038076153, "grad_norm": 12.065289362336179, "learning_rate": 1.4381195152432769e-07, "logits/chosen": -0.7809977531433105, "logits/rejected": -0.7569341659545898, "logps/chosen": -402.4347229003906, "logps/rejected": -426.5815124511719, "loss": 0.6014, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4668489694595337, "rewards/margins": 0.27162590622901917, "rewards/rejected": -1.7384748458862305, "step": 363 }, { "epoch": 0.7780895123580495, "grad_norm": 11.450406850059426, "learning_rate": 1.4119722231959403e-07, "logits/chosen": -0.7261683940887451, "logits/rejected": -0.7380213737487793, "logps/chosen": -320.5738830566406, "logps/rejected": -376.2132568359375, "loss": 0.6148, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1737643480300903, "rewards/margins": 0.4349837005138397, "rewards/rejected": -1.608747959136963, "step": 364 }, { "epoch": 0.7802271209084837, "grad_norm": 10.17075140486933, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -0.6793671250343323, "logits/rejected": -0.6769421100616455, "logps/chosen": -396.4522705078125, "logps/rejected": -426.5015869140625, "loss": 0.5761, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5801575183868408, "rewards/margins": 0.23419132828712463, "rewards/rejected": -1.8143486976623535, "step": 365 }, { "epoch": 0.7823647294589179, "grad_norm": 9.496259172803326, "learning_rate": 1.3602813399504458e-07, "logits/chosen": -0.7178226113319397, "logits/rejected": -0.7088046073913574, "logps/chosen": -362.4988098144531, "logps/rejected": -413.68255615234375, "loss": 0.5697, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4096518754959106, "rewards/margins": 0.4560312330722809, "rewards/rejected": -1.8656830787658691, "step": 366 }, { "epoch": 0.784502338009352, "grad_norm": 11.467183173889156, "learning_rate": 1.3347406408508694e-07, "logits/chosen": -0.58012455701828, "logits/rejected": -0.6086165308952332, "logps/chosen": -381.5002746582031, "logps/rejected": -446.1846618652344, "loss": 0.5768, "rewards/accuracies": 0.625, "rewards/chosen": -1.546051263809204, "rewards/margins": 0.5468287467956543, "rewards/rejected": -2.0928800106048584, "step": 367 }, { "epoch": 0.7866399465597862, "grad_norm": 11.770361743077546, "learning_rate": 1.3094050125632972e-07, "logits/chosen": -0.665503978729248, "logits/rejected": -0.6807020902633667, "logps/chosen": -339.297119140625, "logps/rejected": -378.72283935546875, "loss": 0.6007, "rewards/accuracies": 0.78125, "rewards/chosen": -1.239609718322754, "rewards/margins": 0.311847984790802, "rewards/rejected": -1.5514576435089111, "step": 368 }, { "epoch": 0.7887775551102204, "grad_norm": 11.239777792633861, "learning_rate": 1.284275872613028e-07, "logits/chosen": -0.7516641020774841, "logits/rejected": -0.7523844242095947, "logps/chosen": -465.70562744140625, "logps/rejected": -494.3858642578125, "loss": 0.5955, "rewards/accuracies": 0.625, "rewards/chosen": -1.708259105682373, "rewards/margins": 0.19142737984657288, "rewards/rejected": -1.899686336517334, "step": 369 }, { "epoch": 0.7909151636606546, "grad_norm": 12.24207530779827, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -0.7178503274917603, "logits/rejected": -0.7465229630470276, "logps/chosen": -350.14300537109375, "logps/rejected": -426.7923583984375, "loss": 0.5556, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3200982809066772, "rewards/margins": 0.5981042981147766, "rewards/rejected": -1.918202519416809, "step": 370 }, { "epoch": 0.7930527722110888, "grad_norm": 11.476654461821495, "learning_rate": 1.2346426699819456e-07, "logits/chosen": -0.6654431223869324, "logits/rejected": -0.6413010954856873, "logps/chosen": -432.3926086425781, "logps/rejected": -445.0782165527344, "loss": 0.6153, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8410680294036865, "rewards/margins": 0.26301610469818115, "rewards/rejected": -2.104084014892578, "step": 371 }, { "epoch": 0.795190380761523, "grad_norm": 11.015669950808952, "learning_rate": 1.2101413842727343e-07, "logits/chosen": -0.748419463634491, "logits/rejected": -0.7465101480484009, "logps/chosen": -404.2447204589844, "logps/rejected": -458.7890625, "loss": 0.6227, "rewards/accuracies": 0.75, "rewards/chosen": -1.4635306596755981, "rewards/margins": 0.4813007712364197, "rewards/rejected": -1.9448314905166626, "step": 372 }, { "epoch": 0.7973279893119573, "grad_norm": 11.968874819239444, "learning_rate": 1.1858521406886674e-07, "logits/chosen": -0.6935529112815857, "logits/rejected": -0.6768806576728821, "logps/chosen": -479.6001892089844, "logps/rejected": -526.9801025390625, "loss": 0.5949, "rewards/accuracies": 0.71875, "rewards/chosen": -2.146116018295288, "rewards/margins": 0.5019779205322266, "rewards/rejected": -2.6480939388275146, "step": 373 }, { "epoch": 0.7994655978623915, "grad_norm": 11.31673592574301, "learning_rate": 1.1617762982099444e-07, "logits/chosen": -0.7199594974517822, "logits/rejected": -0.7195298671722412, "logps/chosen": -390.56695556640625, "logps/rejected": -437.9982604980469, "loss": 0.6259, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6399167776107788, "rewards/margins": 0.4196929931640625, "rewards/rejected": -2.0596096515655518, "step": 374 }, { "epoch": 0.8016032064128257, "grad_norm": 11.65245860510705, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -0.6417936086654663, "logits/rejected": -0.5881288051605225, "logps/chosen": -462.2901611328125, "logps/rejected": -533.3080444335938, "loss": 0.639, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8019856214523315, "rewards/margins": 0.778126060962677, "rewards/rejected": -2.580111503601074, "step": 375 }, { "epoch": 0.8037408149632599, "grad_norm": 11.237123066893254, "learning_rate": 1.1142701927151454e-07, "logits/chosen": -0.742131233215332, "logits/rejected": -0.7236477136611938, "logps/chosen": -440.7339782714844, "logps/rejected": -468.85723876953125, "loss": 0.622, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7191107273101807, "rewards/margins": 0.3171979784965515, "rewards/rejected": -2.036308765411377, "step": 376 }, { "epoch": 0.8058784235136941, "grad_norm": 12.182574925046193, "learning_rate": 1.090842587659851e-07, "logits/chosen": -0.6230757832527161, "logits/rejected": -0.6275469064712524, "logps/chosen": -345.8181457519531, "logps/rejected": -382.3629150390625, "loss": 0.6094, "rewards/accuracies": 0.625, "rewards/chosen": -1.315040111541748, "rewards/margins": 0.3147667646408081, "rewards/rejected": -1.6298067569732666, "step": 377 }, { "epoch": 0.8080160320641283, "grad_norm": 12.095497229280761, "learning_rate": 1.0676336994827512e-07, "logits/chosen": -0.8505545258522034, "logits/rejected": -0.8231047987937927, "logps/chosen": -439.4098205566406, "logps/rejected": -450.57861328125, "loss": 0.5882, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8718527555465698, "rewards/margins": 0.0610598549246788, "rewards/rejected": -1.9329125881195068, "step": 378 }, { "epoch": 0.8101536406145624, "grad_norm": 12.306182802408912, "learning_rate": 1.044644826718295e-07, "logits/chosen": -0.6553314924240112, "logits/rejected": -0.6298251152038574, "logps/chosen": -428.9188537597656, "logps/rejected": -464.73126220703125, "loss": 0.5659, "rewards/accuracies": 0.6875, "rewards/chosen": -1.740647554397583, "rewards/margins": 0.34435731172561646, "rewards/rejected": -2.0850048065185547, "step": 379 }, { "epoch": 0.8122912491649966, "grad_norm": 11.84427292451934, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -0.6922661662101746, "logits/rejected": -0.7002755999565125, "logps/chosen": -382.30987548828125, "logps/rejected": -423.1776123046875, "loss": 0.6365, "rewards/accuracies": 0.78125, "rewards/chosen": -1.37299644947052, "rewards/margins": 0.28076884150505066, "rewards/rejected": -1.6537654399871826, "step": 380 }, { "epoch": 0.8144288577154308, "grad_norm": 11.204366574978794, "learning_rate": 9.99332259943969e-08, "logits/chosen": -0.7378983497619629, "logits/rejected": -0.7215259075164795, "logps/chosen": -465.00885009765625, "logps/rejected": -522.8477783203125, "loss": 0.6099, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6207081079483032, "rewards/margins": 0.5316731333732605, "rewards/rejected": -2.152381420135498, "step": 381 }, { "epoch": 0.816566466265865, "grad_norm": 10.577264704091782, "learning_rate": 9.770111011666582e-08, "logits/chosen": -0.7259981632232666, "logits/rejected": -0.7045480012893677, "logps/chosen": -428.8095703125, "logps/rejected": -492.239013671875, "loss": 0.6087, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5366865396499634, "rewards/margins": 0.7509114146232605, "rewards/rejected": -2.287597894668579, "step": 382 }, { "epoch": 0.8187040748162993, "grad_norm": 12.483677889539976, "learning_rate": 9.549150281252632e-08, "logits/chosen": -0.6887928247451782, "logits/rejected": -0.6907156705856323, "logps/chosen": -352.9273681640625, "logps/rejected": -383.6487121582031, "loss": 0.6259, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5845268964767456, "rewards/margins": 0.28942757844924927, "rewards/rejected": -1.8739545345306396, "step": 383 }, { "epoch": 0.8208416833667335, "grad_norm": 19.23190107186564, "learning_rate": 9.330452770923603e-08, "logits/chosen": -0.762394905090332, "logits/rejected": -0.7647604942321777, "logps/chosen": -451.6494140625, "logps/rejected": -534.89892578125, "loss": 0.5934, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8478323221206665, "rewards/margins": 0.6826062798500061, "rewards/rejected": -2.5304384231567383, "step": 384 }, { "epoch": 0.8229792919171677, "grad_norm": 12.247364252908152, "learning_rate": 9.114030716778432e-08, "logits/chosen": -0.7505077123641968, "logits/rejected": -0.7758923768997192, "logps/chosen": -470.6575927734375, "logps/rejected": -503.64556884765625, "loss": 0.6397, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8584275245666504, "rewards/margins": 0.3208252787590027, "rewards/rejected": -2.179252862930298, "step": 385 }, { "epoch": 0.8251169004676019, "grad_norm": 11.383815632835855, "learning_rate": 8.899896227604508e-08, "logits/chosen": -0.6819490194320679, "logits/rejected": -0.6731836199760437, "logps/chosen": -433.1306457519531, "logps/rejected": -487.12646484375, "loss": 0.6317, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6253031492233276, "rewards/margins": 0.4985421299934387, "rewards/rejected": -2.123845100402832, "step": 386 }, { "epoch": 0.8272545090180361, "grad_norm": 13.189195026919496, "learning_rate": 8.688061284200265e-08, "logits/chosen": -0.6536362171173096, "logits/rejected": -0.6316641569137573, "logps/chosen": -447.10577392578125, "logps/rejected": -500.36700439453125, "loss": 0.6544, "rewards/accuracies": 0.65625, "rewards/chosen": -1.6854324340820312, "rewards/margins": 0.3739916980266571, "rewards/rejected": -2.0594239234924316, "step": 387 }, { "epoch": 0.8293921175684703, "grad_norm": 12.05654473393893, "learning_rate": 8.478537738704811e-08, "logits/chosen": -0.7113953232765198, "logits/rejected": -0.6980003118515015, "logps/chosen": -437.1040344238281, "logps/rejected": -477.0093078613281, "loss": 0.5797, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7935041189193726, "rewards/margins": 0.35914352536201477, "rewards/rejected": -2.1526474952697754, "step": 388 }, { "epoch": 0.8315297261189045, "grad_norm": 11.698252029580289, "learning_rate": 8.271337313934867e-08, "logits/chosen": -0.624556839466095, "logits/rejected": -0.6502059698104858, "logps/chosen": -414.85882568359375, "logps/rejected": -456.1212158203125, "loss": 0.6072, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6804428100585938, "rewards/margins": 0.3041376769542694, "rewards/rejected": -1.9845805168151855, "step": 389 }, { "epoch": 0.8336673346693386, "grad_norm": 12.14771475451631, "learning_rate": 8.066471602728803e-08, "logits/chosen": -0.6798664331436157, "logits/rejected": -0.6738008260726929, "logps/chosen": -411.55047607421875, "logps/rejected": -457.5155334472656, "loss": 0.5922, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4717459678649902, "rewards/margins": 0.5215471386909485, "rewards/rejected": -1.993293046951294, "step": 390 }, { "epoch": 0.8358049432197728, "grad_norm": 12.796578845217505, "learning_rate": 7.863952067298041e-08, "logits/chosen": -0.5822688937187195, "logits/rejected": -0.564083993434906, "logps/chosen": -431.5522155761719, "logps/rejected": -450.26739501953125, "loss": 0.628, "rewards/accuracies": 0.625, "rewards/chosen": -1.9067468643188477, "rewards/margins": 0.21568900346755981, "rewards/rejected": -2.1224358081817627, "step": 391 }, { "epoch": 0.837942551770207, "grad_norm": 11.159935748642301, "learning_rate": 7.663790038585794e-08, "logits/chosen": -0.662575364112854, "logits/rejected": -0.6590286493301392, "logps/chosen": -444.98162841796875, "logps/rejected": -497.9795227050781, "loss": 0.5731, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7177234888076782, "rewards/margins": 0.6272789239883423, "rewards/rejected": -2.3450024127960205, "step": 392 }, { "epoch": 0.8400801603206413, "grad_norm": 14.31633694385083, "learning_rate": 7.465996715633027e-08, "logits/chosen": -0.6459007263183594, "logits/rejected": -0.6448737382888794, "logps/chosen": -397.7703552246094, "logps/rejected": -440.38238525390625, "loss": 0.5945, "rewards/accuracies": 0.75, "rewards/chosen": -1.7266194820404053, "rewards/margins": 0.44395869970321655, "rewards/rejected": -2.1705780029296875, "step": 393 }, { "epoch": 0.8422177688710755, "grad_norm": 11.224103010133572, "learning_rate": 7.270583164951926e-08, "logits/chosen": -0.6865531206130981, "logits/rejected": -0.6968246698379517, "logps/chosen": -354.6371154785156, "logps/rejected": -424.811279296875, "loss": 0.6334, "rewards/accuracies": 0.8125, "rewards/chosen": -1.536527156829834, "rewards/margins": 0.5009466409683228, "rewards/rejected": -2.0374739170074463, "step": 394 }, { "epoch": 0.8443553774215097, "grad_norm": 10.826285808287984, "learning_rate": 7.077560319906694e-08, "logits/chosen": -0.6569056510925293, "logits/rejected": -0.6044581532478333, "logps/chosen": -360.92681884765625, "logps/rejected": -372.06512451171875, "loss": 0.6061, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6241035461425781, "rewards/margins": 0.13333997130393982, "rewards/rejected": -1.7574434280395508, "step": 395 }, { "epoch": 0.8464929859719439, "grad_norm": 11.121731952204106, "learning_rate": 6.886938980101869e-08, "logits/chosen": -0.6959440112113953, "logits/rejected": -0.6976322531700134, "logps/chosen": -481.72747802734375, "logps/rejected": -528.6837768554688, "loss": 0.5492, "rewards/accuracies": 0.90625, "rewards/chosen": -1.7375783920288086, "rewards/margins": 0.5362969040870667, "rewards/rejected": -2.2738752365112305, "step": 396 }, { "epoch": 0.8486305945223781, "grad_norm": 11.676543714442664, "learning_rate": 6.698729810778064e-08, "logits/chosen": -0.7131574153900146, "logits/rejected": -0.6955525875091553, "logps/chosen": -399.06610107421875, "logps/rejected": -414.4498291015625, "loss": 0.5949, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6074295043945312, "rewards/margins": 0.191550150513649, "rewards/rejected": -1.798979640007019, "step": 397 }, { "epoch": 0.8507682030728123, "grad_norm": 11.148999020679877, "learning_rate": 6.512943342215232e-08, "logits/chosen": -0.7562680244445801, "logits/rejected": -0.779510498046875, "logps/chosen": -484.00506591796875, "logps/rejected": -511.72882080078125, "loss": 0.5846, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8436009883880615, "rewards/margins": 0.35308870673179626, "rewards/rejected": -2.1966898441314697, "step": 398 }, { "epoch": 0.8529058116232465, "grad_norm": 11.997120047866387, "learning_rate": 6.329589969143517e-08, "logits/chosen": -0.6792132258415222, "logits/rejected": -0.6694210171699524, "logps/chosen": -424.2513427734375, "logps/rejected": -454.1995849609375, "loss": 0.5749, "rewards/accuracies": 0.53125, "rewards/chosen": -1.7970068454742432, "rewards/margins": 0.36387020349502563, "rewards/rejected": -2.160876750946045, "step": 399 }, { "epoch": 0.8550434201736807, "grad_norm": 11.238178437853232, "learning_rate": 6.148679950161672e-08, "logits/chosen": -0.6610137820243835, "logits/rejected": -0.6665123105049133, "logps/chosen": -446.12451171875, "logps/rejected": -491.7059326171875, "loss": 0.5888, "rewards/accuracies": 0.5625, "rewards/chosen": -2.106739044189453, "rewards/margins": 0.3179362714290619, "rewards/rejected": -2.4246749877929688, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -0.6434622406959534, "eval_logits/rejected": -0.6439588069915771, "eval_logps/chosen": -418.7559814453125, "eval_logps/rejected": -459.1051330566406, "eval_loss": 0.6088488698005676, "eval_rewards/accuracies": 0.6829268336296082, "eval_rewards/chosen": -1.6320796012878418, "eval_rewards/margins": 0.34643232822418213, "eval_rewards/rejected": -1.9785118103027344, "eval_runtime": 373.8135, "eval_samples_per_second": 5.246, "eval_steps_per_second": 0.329, "step": 400 }, { "epoch": 0.857181028724115, "grad_norm": 13.938068351492317, "learning_rate": 5.9702234071631e-08, "logits/chosen": -0.6074206233024597, "logits/rejected": -0.57494056224823, "logps/chosen": -432.34869384765625, "logps/rejected": -481.2635498046875, "loss": 0.624, "rewards/accuracies": 0.65625, "rewards/chosen": -1.724292278289795, "rewards/margins": 0.546868622303009, "rewards/rejected": -2.2711610794067383, "step": 401 }, { "epoch": 0.859318637274549, "grad_norm": 14.889197610496133, "learning_rate": 5.794230324769517e-08, "logits/chosen": -0.6924403309822083, "logits/rejected": -0.695598304271698, "logps/chosen": -430.3138732910156, "logps/rejected": -472.5992431640625, "loss": 0.6165, "rewards/accuracies": 0.59375, "rewards/chosen": -1.809744119644165, "rewards/margins": 0.3945625424385071, "rewards/rejected": -2.2043066024780273, "step": 402 }, { "epoch": 0.8614562458249833, "grad_norm": 10.24820423132373, "learning_rate": 5.620710549772295e-08, "logits/chosen": -0.6588191390037537, "logits/rejected": -0.6449538469314575, "logps/chosen": -391.6925354003906, "logps/rejected": -442.3234558105469, "loss": 0.6086, "rewards/accuracies": 0.625, "rewards/chosen": -1.5062448978424072, "rewards/margins": 0.3859245777130127, "rewards/rejected": -1.89216947555542, "step": 403 }, { "epoch": 0.8635938543754175, "grad_norm": 11.857754301901029, "learning_rate": 5.44967379058161e-08, "logits/chosen": -0.7503631114959717, "logits/rejected": -0.7300340533256531, "logps/chosen": -386.56072998046875, "logps/rejected": -396.8390808105469, "loss": 0.5982, "rewards/accuracies": 0.53125, "rewards/chosen": -1.6606768369674683, "rewards/margins": 0.09765629470348358, "rewards/rejected": -1.7583332061767578, "step": 404 }, { "epoch": 0.8657314629258517, "grad_norm": 10.569416708562631, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -0.7464025020599365, "logits/rejected": -0.7271702885627747, "logps/chosen": -413.1631774902344, "logps/rejected": -470.1753234863281, "loss": 0.581, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5785975456237793, "rewards/margins": 0.4657081961631775, "rewards/rejected": -2.0443055629730225, "step": 405 }, { "epoch": 0.8678690714762859, "grad_norm": 11.592126458293672, "learning_rate": 5.11508745810284e-08, "logits/chosen": -0.667618453502655, "logits/rejected": -0.6744921207427979, "logps/chosen": -404.13824462890625, "logps/rejected": -412.8828430175781, "loss": 0.6282, "rewards/accuracies": 0.5, "rewards/chosen": -1.6986844539642334, "rewards/margins": 0.08317200094461441, "rewards/rejected": -1.7818565368652344, "step": 406 }, { "epoch": 0.8700066800267201, "grad_norm": 12.223879763686206, "learning_rate": 4.951556604879048e-08, "logits/chosen": -0.6467772126197815, "logits/rejected": -0.6247937679290771, "logps/chosen": -442.8312683105469, "logps/rejected": -498.9490051269531, "loss": 0.6112, "rewards/accuracies": 0.625, "rewards/chosen": -1.812768816947937, "rewards/margins": 0.4102476239204407, "rewards/rejected": -2.2230165004730225, "step": 407 }, { "epoch": 0.8721442885771543, "grad_norm": 13.609539677706314, "learning_rate": 4.7905462065429946e-08, "logits/chosen": -0.838919997215271, "logits/rejected": -0.8245532512664795, "logps/chosen": -415.890869140625, "logps/rejected": -435.3166198730469, "loss": 0.6788, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5928852558135986, "rewards/margins": 0.25430333614349365, "rewards/rejected": -1.8471887111663818, "step": 408 }, { "epoch": 0.8742818971275885, "grad_norm": 10.454604993981434, "learning_rate": 4.6320652716067555e-08, "logits/chosen": -0.7226736545562744, "logits/rejected": -0.7249311208724976, "logps/chosen": -406.7791748046875, "logps/rejected": -448.5416259765625, "loss": 0.609, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5601541996002197, "rewards/margins": 0.37674978375434875, "rewards/rejected": -1.936903953552246, "step": 409 }, { "epoch": 0.8764195056780227, "grad_norm": 10.922415025272509, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -0.7072638869285583, "logits/rejected": -0.6469031572341919, "logps/chosen": -437.16253662109375, "logps/rejected": -464.5462951660156, "loss": 0.6033, "rewards/accuracies": 0.59375, "rewards/chosen": -1.673638105392456, "rewards/margins": 0.27518972754478455, "rewards/rejected": -1.948827862739563, "step": 410 }, { "epoch": 0.878557114228457, "grad_norm": 11.15308213585594, "learning_rate": 4.322727117869951e-08, "logits/chosen": -0.5786024332046509, "logits/rejected": -0.5698223114013672, "logps/chosen": -387.2678527832031, "logps/rejected": -420.85101318359375, "loss": 0.6038, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5769679546356201, "rewards/margins": 0.32589417695999146, "rewards/rejected": -1.9028621912002563, "step": 411 }, { "epoch": 0.8806947227788912, "grad_norm": 14.488282375387797, "learning_rate": 4.17188720650119e-08, "logits/chosen": -0.7604373097419739, "logits/rejected": -0.7526075839996338, "logps/chosen": -510.45159912109375, "logps/rejected": -509.4095458984375, "loss": 0.6893, "rewards/accuracies": 0.5625, "rewards/chosen": -2.090540885925293, "rewards/margins": 0.033110879361629486, "rewards/rejected": -2.1236515045166016, "step": 412 }, { "epoch": 0.8828323313293254, "grad_norm": 11.024440296301012, "learning_rate": 4.023611372427471e-08, "logits/chosen": -0.7349828481674194, "logits/rejected": -0.7459964156150818, "logps/chosen": -388.6877746582031, "logps/rejected": -420.792236328125, "loss": 0.5967, "rewards/accuracies": 0.625, "rewards/chosen": -1.7269947528839111, "rewards/margins": 0.28061679005622864, "rewards/rejected": -2.0076115131378174, "step": 413 }, { "epoch": 0.8849699398797595, "grad_norm": 11.21797390523024, "learning_rate": 3.877907911663542e-08, "logits/chosen": -0.6687692403793335, "logits/rejected": -0.6710121631622314, "logps/chosen": -361.7718200683594, "logps/rejected": -406.8067321777344, "loss": 0.5766, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2828166484832764, "rewards/margins": 0.4076838493347168, "rewards/rejected": -1.6905003786087036, "step": 414 }, { "epoch": 0.8871075484301937, "grad_norm": 11.310104114342186, "learning_rate": 3.734784976300165e-08, "logits/chosen": -0.7112718820571899, "logits/rejected": -0.6793174743652344, "logps/chosen": -395.9449768066406, "logps/rejected": -415.42108154296875, "loss": 0.6427, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5930345058441162, "rewards/margins": 0.189566969871521, "rewards/rejected": -1.7826014757156372, "step": 415 }, { "epoch": 0.8892451569806279, "grad_norm": 11.184536783628738, "learning_rate": 3.594250574048058e-08, "logits/chosen": -0.6613335609436035, "logits/rejected": -0.6428050994873047, "logps/chosen": -367.02874755859375, "logps/rejected": -389.9844970703125, "loss": 0.6174, "rewards/accuracies": 0.65625, "rewards/chosen": -1.537019968032837, "rewards/margins": 0.1499500423669815, "rewards/rejected": -1.6869698762893677, "step": 416 }, { "epoch": 0.8913827655310621, "grad_norm": 12.205433845637979, "learning_rate": 3.456312567789793e-08, "logits/chosen": -0.7070876955986023, "logits/rejected": -0.7160503268241882, "logps/chosen": -469.0753173828125, "logps/rejected": -494.7930908203125, "loss": 0.6228, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9284939765930176, "rewards/margins": 0.23351570963859558, "rewards/rejected": -2.1620097160339355, "step": 417 }, { "epoch": 0.8935203740814963, "grad_norm": 11.816949999049964, "learning_rate": 3.3209786751399184e-08, "logits/chosen": -0.6653708815574646, "logits/rejected": -0.6532600522041321, "logps/chosen": -464.9654541015625, "logps/rejected": -504.6747741699219, "loss": 0.5449, "rewards/accuracies": 0.75, "rewards/chosen": -1.9524656534194946, "rewards/margins": 0.3868550658226013, "rewards/rejected": -2.339320659637451, "step": 418 }, { "epoch": 0.8956579826319305, "grad_norm": 10.879487484866441, "learning_rate": 3.188256468013139e-08, "logits/chosen": -0.6497898101806641, "logits/rejected": -0.6454100608825684, "logps/chosen": -478.6482238769531, "logps/rejected": -530.9612426757812, "loss": 0.5632, "rewards/accuracies": 0.75, "rewards/chosen": -1.8596200942993164, "rewards/margins": 0.5317977070808411, "rewards/rejected": -2.3914177417755127, "step": 419 }, { "epoch": 0.8977955911823647, "grad_norm": 11.880007198303698, "learning_rate": 3.058153372200695e-08, "logits/chosen": -0.6183308959007263, "logits/rejected": -0.6003840565681458, "logps/chosen": -459.9405212402344, "logps/rejected": -505.4646911621094, "loss": 0.6181, "rewards/accuracies": 0.65625, "rewards/chosen": -1.9765161275863647, "rewards/margins": 0.4471958875656128, "rewards/rejected": -2.4237117767333984, "step": 420 }, { "epoch": 0.899933199732799, "grad_norm": 13.426952468351809, "learning_rate": 2.9306766669548457e-08, "logits/chosen": -0.7094901204109192, "logits/rejected": -0.6653531193733215, "logps/chosen": -466.24029541015625, "logps/rejected": -487.9982604980469, "loss": 0.5993, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9488886594772339, "rewards/margins": 0.351326048374176, "rewards/rejected": -2.3002147674560547, "step": 421 }, { "epoch": 0.9020708082832332, "grad_norm": 11.556511924801233, "learning_rate": 2.805833484581621e-08, "logits/chosen": -0.8073502779006958, "logits/rejected": -0.7438942790031433, "logps/chosen": -459.5665588378906, "logps/rejected": -462.51519775390625, "loss": 0.5975, "rewards/accuracies": 0.53125, "rewards/chosen": -1.806351900100708, "rewards/margins": 0.16977502405643463, "rewards/rejected": -1.976126790046692, "step": 422 }, { "epoch": 0.9042084168336674, "grad_norm": 12.210595464753169, "learning_rate": 2.6836308100417872e-08, "logits/chosen": -0.6977376341819763, "logits/rejected": -0.6720814108848572, "logps/chosen": -427.6357727050781, "logps/rejected": -460.1345520019531, "loss": 0.5831, "rewards/accuracies": 0.78125, "rewards/chosen": -1.711733102798462, "rewards/margins": 0.44918665289878845, "rewards/rejected": -2.1609199047088623, "step": 423 }, { "epoch": 0.9063460253841016, "grad_norm": 9.940416325858004, "learning_rate": 2.5640754805600128e-08, "logits/chosen": -0.7047473788261414, "logits/rejected": -0.7050879597663879, "logps/chosen": -355.5130615234375, "logps/rejected": -383.0091552734375, "loss": 0.6143, "rewards/accuracies": 0.625, "rewards/chosen": -1.397302508354187, "rewards/margins": 0.2378145009279251, "rewards/rejected": -1.6351170539855957, "step": 424 }, { "epoch": 0.9084836339345357, "grad_norm": 12.20154656454581, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -0.7828183174133301, "logits/rejected": -0.7872889041900635, "logps/chosen": -358.20751953125, "logps/rejected": -394.5291748046875, "loss": 0.5956, "rewards/accuracies": 0.75, "rewards/chosen": -1.3747402429580688, "rewards/margins": 0.26737523078918457, "rewards/rejected": -1.6421154737472534, "step": 425 }, { "epoch": 0.9106212424849699, "grad_norm": 10.941418508752523, "learning_rate": 2.3329334647018694e-08, "logits/chosen": -0.6170888543128967, "logits/rejected": -0.5692444443702698, "logps/chosen": -472.42864990234375, "logps/rejected": -516.1029052734375, "loss": 0.5838, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0783095359802246, "rewards/margins": 0.4643981158733368, "rewards/rejected": -2.542707681655884, "step": 426 }, { "epoch": 0.9127588510354041, "grad_norm": 11.303024362125633, "learning_rate": 2.2213597106929605e-08, "logits/chosen": -0.5531542301177979, "logits/rejected": -0.5305842161178589, "logps/chosen": -422.59100341796875, "logps/rejected": -460.8222961425781, "loss": 0.6129, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7473095655441284, "rewards/margins": 0.3811667263507843, "rewards/rejected": -2.12847638130188, "step": 427 }, { "epoch": 0.9148964595858383, "grad_norm": 14.94800877117672, "learning_rate": 2.1124591657534774e-08, "logits/chosen": -0.6627920866012573, "logits/rejected": -0.6768360733985901, "logps/chosen": -437.7267150878906, "logps/rejected": -494.59075927734375, "loss": 0.6108, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8751461505889893, "rewards/margins": 0.4180339574813843, "rewards/rejected": -2.293180227279663, "step": 428 }, { "epoch": 0.9170340681362725, "grad_norm": 11.706241248792244, "learning_rate": 2.0062379228555525e-08, "logits/chosen": -0.6479263305664062, "logits/rejected": -0.6203778386116028, "logps/chosen": -371.62310791015625, "logps/rejected": -380.03436279296875, "loss": 0.6172, "rewards/accuracies": 0.625, "rewards/chosen": -1.4537248611450195, "rewards/margins": 0.1625545471906662, "rewards/rejected": -1.6162794828414917, "step": 429 }, { "epoch": 0.9191716766867067, "grad_norm": 11.343840156206866, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -0.7142120003700256, "logits/rejected": -0.7347142696380615, "logps/chosen": -411.76312255859375, "logps/rejected": -476.20697021484375, "loss": 0.5984, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6978678703308105, "rewards/margins": 0.46169888973236084, "rewards/rejected": -2.159566640853882, "step": 430 }, { "epoch": 0.921309285237141, "grad_norm": 10.191297953603781, "learning_rate": 1.8018569652073378e-08, "logits/chosen": -0.5895026922225952, "logits/rejected": -0.5850787162780762, "logps/chosen": -406.5594482421875, "logps/rejected": -485.04217529296875, "loss": 0.593, "rewards/accuracies": 0.84375, "rewards/chosen": -1.616127371788025, "rewards/margins": 0.5671988129615784, "rewards/rejected": -2.183326244354248, "step": 431 }, { "epoch": 0.9234468937875752, "grad_norm": 12.93643748441664, "learning_rate": 1.7037086855465898e-08, "logits/chosen": -0.7007228136062622, "logits/rejected": -0.6858587265014648, "logps/chosen": -412.77496337890625, "logps/rejected": -458.0037841796875, "loss": 0.6264, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7415974140167236, "rewards/margins": 0.3250362277030945, "rewards/rejected": -2.066633701324463, "step": 432 }, { "epoch": 0.9255845023380094, "grad_norm": 12.402383758119484, "learning_rate": 1.6082625774666792e-08, "logits/chosen": -0.6870225667953491, "logits/rejected": -0.6988283395767212, "logps/chosen": -401.1902770996094, "logps/rejected": -415.27093505859375, "loss": 0.5975, "rewards/accuracies": 0.59375, "rewards/chosen": -1.706886649131775, "rewards/margins": 0.06286803632974625, "rewards/rejected": -1.7697547674179077, "step": 433 }, { "epoch": 0.9277221108884436, "grad_norm": 11.598555340742855, "learning_rate": 1.5155239811656562e-08, "logits/chosen": -0.7391936182975769, "logits/rejected": -0.7346464395523071, "logps/chosen": -362.8863525390625, "logps/rejected": -407.58392333984375, "loss": 0.5696, "rewards/accuracies": 0.75, "rewards/chosen": -1.4570672512054443, "rewards/margins": 0.35567349195480347, "rewards/rejected": -1.8127408027648926, "step": 434 }, { "epoch": 0.9298597194388778, "grad_norm": 12.334039197101353, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -0.7256093621253967, "logits/rejected": -0.7030697464942932, "logps/chosen": -498.5211181640625, "logps/rejected": -495.9085693359375, "loss": 0.6381, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0069806575775146, "rewards/margins": 0.12008260935544968, "rewards/rejected": -2.127063274383545, "step": 435 }, { "epoch": 0.931997327989312, "grad_norm": 10.831899946588043, "learning_rate": 1.3381899269774289e-08, "logits/chosen": -0.7507193088531494, "logits/rejected": -0.7519603967666626, "logps/chosen": -359.82000732421875, "logps/rejected": -395.96771240234375, "loss": 0.5773, "rewards/accuracies": 0.84375, "rewards/chosen": -1.372948408126831, "rewards/margins": 0.3809196352958679, "rewards/rejected": -1.7538681030273438, "step": 436 }, { "epoch": 0.9341349365397461, "grad_norm": 14.210280355388763, "learning_rate": 1.253604390908819e-08, "logits/chosen": -0.5923041701316833, "logits/rejected": -0.6011568307876587, "logps/chosen": -345.30633544921875, "logps/rejected": -392.51751708984375, "loss": 0.6674, "rewards/accuracies": 0.71875, "rewards/chosen": -1.585827350616455, "rewards/margins": 0.3826131224632263, "rewards/rejected": -1.968440294265747, "step": 437 }, { "epoch": 0.9362725450901803, "grad_norm": 9.851284631791968, "learning_rate": 1.1717462097011855e-08, "logits/chosen": -0.6331924796104431, "logits/rejected": -0.6489231586456299, "logps/chosen": -429.216796875, "logps/rejected": -477.13836669921875, "loss": 0.5764, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8334829807281494, "rewards/margins": 0.3744773864746094, "rewards/rejected": -2.207960605621338, "step": 438 }, { "epoch": 0.9384101536406145, "grad_norm": 11.272588511911392, "learning_rate": 1.0926199633097154e-08, "logits/chosen": -0.5822413563728333, "logits/rejected": -0.5487803220748901, "logps/chosen": -428.51934814453125, "logps/rejected": -487.8909606933594, "loss": 0.6055, "rewards/accuracies": 0.625, "rewards/chosen": -1.6083500385284424, "rewards/margins": 0.4286819398403168, "rewards/rejected": -2.037031888961792, "step": 439 }, { "epoch": 0.9405477621910487, "grad_norm": 10.801295437898501, "learning_rate": 1.016230078838226e-08, "logits/chosen": -0.7505050897598267, "logits/rejected": -0.7208874225616455, "logps/chosen": -511.32110595703125, "logps/rejected": -563.9736938476562, "loss": 0.5874, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1491966247558594, "rewards/margins": 0.5160467624664307, "rewards/rejected": -2.665243625640869, "step": 440 }, { "epoch": 0.942685370741483, "grad_norm": 12.022377343895434, "learning_rate": 9.425808302913728e-09, "logits/chosen": -0.6826910972595215, "logits/rejected": -0.7009281516075134, "logps/chosen": -396.803466796875, "logps/rejected": -475.8189697265625, "loss": 0.5696, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5288541316986084, "rewards/margins": 0.5459466576576233, "rewards/rejected": -2.074800968170166, "step": 441 }, { "epoch": 0.9448229792919172, "grad_norm": 11.529829218714097, "learning_rate": 8.716763383355862e-09, "logits/chosen": -0.6541940569877625, "logits/rejected": -0.6755858063697815, "logps/chosen": -480.7030944824219, "logps/rejected": -526.9130249023438, "loss": 0.5949, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1652467250823975, "rewards/margins": 0.42291441559791565, "rewards/rejected": -2.588160991668701, "step": 442 }, { "epoch": 0.9469605878423514, "grad_norm": 11.296144808610602, "learning_rate": 8.035205700685165e-09, "logits/chosen": -0.5620754361152649, "logits/rejected": -0.5832556486129761, "logps/chosen": -406.50115966796875, "logps/rejected": -483.1033935546875, "loss": 0.5998, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7801560163497925, "rewards/margins": 0.6258376240730286, "rewards/rejected": -2.4059934616088867, "step": 443 }, { "epoch": 0.9490981963927856, "grad_norm": 14.459436916194253, "learning_rate": 7.381173387970397e-09, "logits/chosen": -0.6875967979431152, "logits/rejected": -0.7037211060523987, "logps/chosen": -387.79193115234375, "logps/rejected": -406.17730712890625, "loss": 0.625, "rewards/accuracies": 0.59375, "rewards/chosen": -1.775752067565918, "rewards/margins": 0.16077642142772675, "rewards/rejected": -1.9365284442901611, "step": 444 }, { "epoch": 0.9512358049432198, "grad_norm": 12.667943463380384, "learning_rate": 6.754703038239329e-09, "logits/chosen": -0.6868771314620972, "logits/rejected": -0.6806486248970032, "logps/chosen": -391.65704345703125, "logps/rejected": -439.91534423828125, "loss": 0.616, "rewards/accuracies": 0.75, "rewards/chosen": -1.5933175086975098, "rewards/margins": 0.549602746963501, "rewards/rejected": -2.14292049407959, "step": 445 }, { "epoch": 0.953373413493654, "grad_norm": 13.999625605626278, "learning_rate": 6.15582970243117e-09, "logits/chosen": -0.679996132850647, "logits/rejected": -0.6954419612884521, "logps/chosen": -411.924072265625, "logps/rejected": -465.689453125, "loss": 0.5689, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5961847305297852, "rewards/margins": 0.44558507204055786, "rewards/rejected": -2.0417697429656982, "step": 446 }, { "epoch": 0.9555110220440882, "grad_norm": 10.363695148382087, "learning_rate": 5.5845868874357385e-09, "logits/chosen": -0.6567386388778687, "logits/rejected": -0.6833846569061279, "logps/chosen": -491.9205627441406, "logps/rejected": -569.992431640625, "loss": 0.5532, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7513771057128906, "rewards/margins": 0.6100505590438843, "rewards/rejected": -2.3614273071289062, "step": 447 }, { "epoch": 0.9576486305945224, "grad_norm": 13.618449735155838, "learning_rate": 5.0410065542185184e-09, "logits/chosen": -0.5561550855636597, "logits/rejected": -0.5477365851402283, "logps/chosen": -404.7331848144531, "logps/rejected": -456.8463134765625, "loss": 0.5897, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8092013597488403, "rewards/margins": 0.39521071314811707, "rewards/rejected": -2.2044119834899902, "step": 448 }, { "epoch": 0.9597862391449565, "grad_norm": 11.691098922849898, "learning_rate": 4.5251191160326495e-09, "logits/chosen": -0.7571395039558411, "logits/rejected": -0.6862713098526001, "logps/chosen": -404.239501953125, "logps/rejected": -430.745849609375, "loss": 0.6223, "rewards/accuracies": 0.59375, "rewards/chosen": -1.5349277257919312, "rewards/margins": 0.27732717990875244, "rewards/rejected": -1.8122549057006836, "step": 449 }, { "epoch": 0.9619238476953907, "grad_norm": 11.212537524360101, "learning_rate": 4.036953436716895e-09, "logits/chosen": -0.6666488647460938, "logits/rejected": -0.6362468600273132, "logps/chosen": -390.62713623046875, "logps/rejected": -432.0528564453125, "loss": 0.5807, "rewards/accuracies": 0.6875, "rewards/chosen": -1.528037428855896, "rewards/margins": 0.3787933588027954, "rewards/rejected": -1.9068307876586914, "step": 450 }, { "epoch": 0.964061456245825, "grad_norm": 11.304696270661244, "learning_rate": 3.5765368290813223e-09, "logits/chosen": -0.6946466565132141, "logits/rejected": -0.7254693508148193, "logps/chosen": -417.11187744140625, "logps/rejected": -472.14288330078125, "loss": 0.6012, "rewards/accuracies": 0.71875, "rewards/chosen": -1.704296588897705, "rewards/margins": 0.47184205055236816, "rewards/rejected": -2.1761388778686523, "step": 451 }, { "epoch": 0.9661990647962592, "grad_norm": 12.23326718620082, "learning_rate": 3.1438950533786977e-09, "logits/chosen": -0.727628767490387, "logits/rejected": -0.7244228720664978, "logps/chosen": -368.25653076171875, "logps/rejected": -406.3876647949219, "loss": 0.6045, "rewards/accuracies": 0.71875, "rewards/chosen": -1.741332769393921, "rewards/margins": 0.22878167033195496, "rewards/rejected": -1.9701144695281982, "step": 452 }, { "epoch": 0.9683366733466934, "grad_norm": 12.889743033859292, "learning_rate": 2.739052315863355e-09, "logits/chosen": -0.7463970184326172, "logits/rejected": -0.7229277491569519, "logps/chosen": -395.8512878417969, "logps/rejected": -452.2123718261719, "loss": 0.5944, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4541352987289429, "rewards/margins": 0.4854976534843445, "rewards/rejected": -1.9396328926086426, "step": 453 }, { "epoch": 0.9704742818971276, "grad_norm": 11.397457544749724, "learning_rate": 2.3620312674367816e-09, "logits/chosen": -0.7733277678489685, "logits/rejected": -0.761780858039856, "logps/chosen": -469.01544189453125, "logps/rejected": -496.7162780761719, "loss": 0.6331, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8822633028030396, "rewards/margins": 0.15627171099185944, "rewards/rejected": -2.0385348796844482, "step": 454 }, { "epoch": 0.9726118904475618, "grad_norm": 12.018897554978574, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -0.6989326477050781, "logits/rejected": -0.7310012578964233, "logps/chosen": -407.86639404296875, "logps/rejected": -467.86798095703125, "loss": 0.5709, "rewards/accuracies": 0.65625, "rewards/chosen": -1.7583638429641724, "rewards/margins": 0.5012065172195435, "rewards/rejected": -2.2595701217651367, "step": 455 }, { "epoch": 0.974749498997996, "grad_norm": 12.451774147613516, "learning_rate": 1.6915370571756181e-09, "logits/chosen": -0.7267682552337646, "logits/rejected": -0.7152563333511353, "logps/chosen": -450.92230224609375, "logps/rejected": -483.6854248046875, "loss": 0.6282, "rewards/accuracies": 0.53125, "rewards/chosen": -1.7722840309143066, "rewards/margins": 0.15732887387275696, "rewards/rejected": -1.9296131134033203, "step": 456 }, { "epoch": 0.9768871075484302, "grad_norm": 12.126324151502713, "learning_rate": 1.3981014094099353e-09, "logits/chosen": -0.7544288635253906, "logits/rejected": -0.7525961995124817, "logps/chosen": -397.43109130859375, "logps/rejected": -431.8571472167969, "loss": 0.5846, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4742791652679443, "rewards/margins": 0.3458634316921234, "rewards/rejected": -1.8201426267623901, "step": 457 }, { "epoch": 0.9790247160988644, "grad_norm": 9.907495266444734, "learning_rate": 1.1325624767719588e-09, "logits/chosen": -0.6586907505989075, "logits/rejected": -0.6237790584564209, "logps/chosen": -395.1099853515625, "logps/rejected": -438.49237060546875, "loss": 0.5992, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5770848989486694, "rewards/margins": 0.40755948424339294, "rewards/rejected": -1.9846441745758057, "step": 458 }, { "epoch": 0.9811623246492986, "grad_norm": 13.123065251970033, "learning_rate": 8.949351161324225e-10, "logits/chosen": -0.6515368223190308, "logits/rejected": -0.6513477563858032, "logps/chosen": -411.0286560058594, "logps/rejected": -474.112548828125, "loss": 0.621, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7529451847076416, "rewards/margins": 0.5362708568572998, "rewards/rejected": -2.2892158031463623, "step": 459 }, { "epoch": 0.9832999331997327, "grad_norm": 12.071426108415315, "learning_rate": 6.852326227130833e-10, "logits/chosen": -0.7456957697868347, "logits/rejected": -0.6752879023551941, "logps/chosen": -450.5098876953125, "logps/rejected": -455.85577392578125, "loss": 0.626, "rewards/accuracies": 0.625, "rewards/chosen": -1.8361914157867432, "rewards/margins": 0.20122388005256653, "rewards/rejected": -2.037415027618408, "step": 460 }, { "epoch": 0.985437541750167, "grad_norm": 11.748757454977515, "learning_rate": 5.034667293427053e-10, "logits/chosen": -0.7174670696258545, "logits/rejected": -0.6987491250038147, "logps/chosen": -434.594482421875, "logps/rejected": -480.0499267578125, "loss": 0.6146, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7917670011520386, "rewards/margins": 0.3851429224014282, "rewards/rejected": -2.176909923553467, "step": 461 }, { "epoch": 0.9875751503006012, "grad_norm": 20.824968125195323, "learning_rate": 3.4964760580069585e-10, "logits/chosen": -0.555869460105896, "logits/rejected": -0.5152798891067505, "logps/chosen": -407.359375, "logps/rejected": -415.696533203125, "loss": 0.6489, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7928351163864136, "rewards/margins": 0.13394100964069366, "rewards/rejected": -1.9267761707305908, "step": 462 }, { "epoch": 0.9897127588510354, "grad_norm": 11.441495646450653, "learning_rate": 2.2378385824833866e-10, "logits/chosen": -0.7355363965034485, "logits/rejected": -0.727141261100769, "logps/chosen": -411.1090087890625, "logps/rejected": -475.9149475097656, "loss": 0.6229, "rewards/accuracies": 0.625, "rewards/chosen": -1.861409068107605, "rewards/margins": 0.488926500082016, "rewards/rejected": -2.3503353595733643, "step": 463 }, { "epoch": 0.9918503674014696, "grad_norm": 11.714079308527252, "learning_rate": 1.2588252874673466e-10, "logits/chosen": -0.8587902784347534, "logits/rejected": -0.8111391663551331, "logps/chosen": -470.9283447265625, "logps/rejected": -455.00372314453125, "loss": 0.6381, "rewards/accuracies": 0.53125, "rewards/chosen": -1.885197639465332, "rewards/margins": 0.07718580961227417, "rewards/rejected": -1.9623833894729614, "step": 464 }, { "epoch": 0.9939879759519038, "grad_norm": 16.820068325011835, "learning_rate": 5.594909486328348e-11, "logits/chosen": -0.5276237726211548, "logits/rejected": -0.5462942123413086, "logps/chosen": -459.17626953125, "logps/rejected": -478.39056396484375, "loss": 0.6987, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8299469947814941, "rewards/margins": 0.1971490979194641, "rewards/rejected": -2.0270960330963135, "step": 465 }, { "epoch": 0.996125584502338, "grad_norm": 10.981775368002003, "learning_rate": 1.3987469365095429e-11, "logits/chosen": -0.787868082523346, "logits/rejected": -0.8146266937255859, "logps/chosen": -463.4134826660156, "logps/rejected": -492.61859130859375, "loss": 0.5814, "rewards/accuracies": 0.625, "rewards/chosen": -1.6699291467666626, "rewards/margins": 0.16003668308258057, "rewards/rejected": -1.8299658298492432, "step": 466 }, { "epoch": 0.9982631930527722, "grad_norm": 10.869184778690476, "learning_rate": 0.0, "logits/chosen": -0.7284511923789978, "logits/rejected": -0.7266198992729187, "logps/chosen": -403.50836181640625, "logps/rejected": -409.22369384765625, "loss": 0.6449, "rewards/accuracies": 0.625, "rewards/chosen": -1.5633559226989746, "rewards/margins": 0.2905767261981964, "rewards/rejected": -1.8539327383041382, "step": 467 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 0.6321173631915189, "train_runtime": 21471.9268, "train_samples_per_second": 2.789, "train_steps_per_second": 0.022 } ], "logging_steps": 1, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }