Llama-3-8B-Magpie-Align-v0.1 / trainer_state.json
flydust's picture
Model save
568bba4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982631930527722,
"eval_steps": 100,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021376085504342017,
"grad_norm": 4.503899550790205,
"learning_rate": 2.127659574468085e-08,
"logits/chosen": -0.8003637194633484,
"logits/rejected": -0.8448871970176697,
"logps/chosen": -212.04685974121094,
"logps/rejected": -206.4463348388672,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0042752171008684035,
"grad_norm": 4.89256031461174,
"learning_rate": 4.25531914893617e-08,
"logits/chosen": -0.750135064125061,
"logits/rejected": -0.7247368097305298,
"logps/chosen": -271.5355529785156,
"logps/rejected": -260.5343322753906,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.006412825651302605,
"grad_norm": 4.511049028695194,
"learning_rate": 6.382978723404254e-08,
"logits/chosen": -0.9132480621337891,
"logits/rejected": -0.9213609099388123,
"logps/chosen": -259.10791015625,
"logps/rejected": -262.6512756347656,
"loss": 0.6935,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0005805277032777667,
"rewards/margins": -0.001751818461343646,
"rewards/rejected": 0.0011712908744812012,
"step": 3
},
{
"epoch": 0.008550434201736807,
"grad_norm": 5.0258481504448485,
"learning_rate": 8.51063829787234e-08,
"logits/chosen": -0.8424134850502014,
"logits/rejected": -0.8080853223800659,
"logps/chosen": -251.00387573242188,
"logps/rejected": -255.1189422607422,
"loss": 0.6929,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0018655203748494387,
"rewards/margins": 0.0010831927647814155,
"rewards/rejected": 0.0007823276100680232,
"step": 4
},
{
"epoch": 0.01068804275217101,
"grad_norm": 4.75851133644133,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -0.9411681294441223,
"logits/rejected": -0.9376619458198547,
"logps/chosen": -289.8980407714844,
"logps/rejected": -274.7005615234375,
"loss": 0.6929,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0026531934272497892,
"rewards/margins": 0.0023759508039802313,
"rewards/rejected": 0.00027724262326955795,
"step": 5
},
{
"epoch": 0.01282565130260521,
"grad_norm": 4.443327602655402,
"learning_rate": 1.2765957446808508e-07,
"logits/chosen": -0.7161233425140381,
"logits/rejected": -0.6978777647018433,
"logps/chosen": -223.0089569091797,
"logps/rejected": -222.1771240234375,
"loss": 0.6934,
"rewards/accuracies": 0.34375,
"rewards/chosen": -0.0012396120000630617,
"rewards/margins": -0.0016972327139228582,
"rewards/rejected": 0.00045762062654830515,
"step": 6
},
{
"epoch": 0.014963259853039413,
"grad_norm": 5.506063836746189,
"learning_rate": 1.4893617021276595e-07,
"logits/chosen": -0.9607124924659729,
"logits/rejected": -0.9491544961929321,
"logps/chosen": -310.2432556152344,
"logps/rejected": -305.9755554199219,
"loss": 0.6926,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0012061572633683681,
"rewards/margins": -0.0002907347516156733,
"rewards/rejected": 0.0014968919567763805,
"step": 7
},
{
"epoch": 0.017100868403473614,
"grad_norm": 4.851635423100062,
"learning_rate": 1.702127659574468e-07,
"logits/chosen": -0.8928542137145996,
"logits/rejected": -0.8853560090065002,
"logps/chosen": -247.1142120361328,
"logps/rejected": -244.08663940429688,
"loss": 0.693,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0002220773312728852,
"rewards/margins": 0.0013035106239840388,
"rewards/rejected": -0.0010814334964379668,
"step": 8
},
{
"epoch": 0.019238476953907815,
"grad_norm": 4.87939101936585,
"learning_rate": 1.9148936170212765e-07,
"logits/chosen": -0.8140461444854736,
"logits/rejected": -0.8076512813568115,
"logps/chosen": -272.2711486816406,
"logps/rejected": -284.1283264160156,
"loss": 0.6935,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.0010817217407748103,
"rewards/margins": -0.001508195186033845,
"rewards/rejected": 0.0004264736198820174,
"step": 9
},
{
"epoch": 0.02137608550434202,
"grad_norm": 4.562355516566984,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -0.8849949836730957,
"logits/rejected": -0.8811756372451782,
"logps/chosen": -229.57052612304688,
"logps/rejected": -231.6889190673828,
"loss": 0.6931,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.00039355267654173076,
"rewards/margins": 0.0002220940077677369,
"rewards/rejected": 0.0001714585960144177,
"step": 10
},
{
"epoch": 0.02351369405477622,
"grad_norm": 4.67288441235731,
"learning_rate": 2.3404255319148937e-07,
"logits/chosen": -0.8189717531204224,
"logits/rejected": -0.8200615644454956,
"logps/chosen": -273.552734375,
"logps/rejected": -277.36859130859375,
"loss": 0.6934,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.0009033679380081594,
"rewards/margins": -0.0013172316830605268,
"rewards/rejected": 0.0004138636286370456,
"step": 11
},
{
"epoch": 0.02565130260521042,
"grad_norm": 4.805681101367893,
"learning_rate": 2.5531914893617016e-07,
"logits/chosen": -0.9043698906898499,
"logits/rejected": -0.8993241190910339,
"logps/chosen": -273.664306640625,
"logps/rejected": -268.0246887207031,
"loss": 0.6929,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.0001288223429583013,
"rewards/margins": -3.8141035474836826e-05,
"rewards/rejected": 0.00016696332022547722,
"step": 12
},
{
"epoch": 0.02778891115564462,
"grad_norm": 4.91733558840618,
"learning_rate": 2.7659574468085106e-07,
"logits/chosen": -0.8745774626731873,
"logits/rejected": -0.8446710705757141,
"logps/chosen": -243.00827026367188,
"logps/rejected": -229.5283203125,
"loss": 0.693,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.002203774405643344,
"rewards/margins": -0.00030036212410777807,
"rewards/rejected": -0.0019034123979508877,
"step": 13
},
{
"epoch": 0.029926519706078826,
"grad_norm": 5.299324976103458,
"learning_rate": 2.978723404255319e-07,
"logits/chosen": -0.7348307967185974,
"logits/rejected": -0.7354189157485962,
"logps/chosen": -186.85391235351562,
"logps/rejected": -199.67623901367188,
"loss": 0.6932,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.0016946769319474697,
"rewards/margins": -0.0006936597637832165,
"rewards/rejected": -0.001001017284579575,
"step": 14
},
{
"epoch": 0.03206412825651302,
"grad_norm": 4.755602904170831,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.7406636476516724,
"logits/rejected": -0.7166301608085632,
"logps/chosen": -199.6678466796875,
"logps/rejected": -194.37559509277344,
"loss": 0.6928,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.0003121185291092843,
"rewards/margins": 0.0012607788667082787,
"rewards/rejected": -0.0009486603084951639,
"step": 15
},
{
"epoch": 0.03420173680694723,
"grad_norm": 4.853620806434979,
"learning_rate": 3.404255319148936e-07,
"logits/chosen": -0.78841632604599,
"logits/rejected": -0.7843498587608337,
"logps/chosen": -266.4180908203125,
"logps/rejected": -271.6226806640625,
"loss": 0.693,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0009592341957613826,
"rewards/margins": 0.0019549226853996515,
"rewards/rejected": -0.002914156997576356,
"step": 16
},
{
"epoch": 0.03633934535738143,
"grad_norm": 5.423827110862174,
"learning_rate": 3.617021276595745e-07,
"logits/chosen": -0.9736945629119873,
"logits/rejected": -0.9769234657287598,
"logps/chosen": -258.8900146484375,
"logps/rejected": -264.2679748535156,
"loss": 0.6935,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.0061743613332509995,
"rewards/margins": -0.001890932791866362,
"rewards/rejected": -0.0042834291234612465,
"step": 17
},
{
"epoch": 0.03847695390781563,
"grad_norm": 4.824497254280432,
"learning_rate": 3.829787234042553e-07,
"logits/chosen": -0.851763904094696,
"logits/rejected": -0.8533320426940918,
"logps/chosen": -273.1241760253906,
"logps/rejected": -269.42315673828125,
"loss": 0.6927,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.0023198507260531187,
"rewards/margins": -0.0010572766186669469,
"rewards/rejected": -0.0012625741073861718,
"step": 18
},
{
"epoch": 0.040614562458249834,
"grad_norm": 4.885682499438778,
"learning_rate": 4.0425531914893614e-07,
"logits/chosen": -0.9122135043144226,
"logits/rejected": -0.9140520095825195,
"logps/chosen": -336.9332580566406,
"logps/rejected": -327.79571533203125,
"loss": 0.6932,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.007231764495372772,
"rewards/margins": -0.00179797422606498,
"rewards/rejected": -0.005433791317045689,
"step": 19
},
{
"epoch": 0.04275217100868404,
"grad_norm": 4.403742601709981,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.8458749055862427,
"logits/rejected": -0.8761993646621704,
"logps/chosen": -258.8704833984375,
"logps/rejected": -263.5494079589844,
"loss": 0.6928,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.005775632336735725,
"rewards/margins": 0.00025997147895395756,
"rewards/rejected": -0.006035604514181614,
"step": 20
},
{
"epoch": 0.044889779559118236,
"grad_norm": 5.1980600006783195,
"learning_rate": 4.4680851063829783e-07,
"logits/chosen": -0.7707018852233887,
"logits/rejected": -0.7247700691223145,
"logps/chosen": -233.66183471679688,
"logps/rejected": -255.91018676757812,
"loss": 0.6924,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.006643190514296293,
"rewards/margins": 0.0013489744160324335,
"rewards/rejected": -0.00799216516315937,
"step": 21
},
{
"epoch": 0.04702738810955244,
"grad_norm": 4.514553831312047,
"learning_rate": 4.6808510638297873e-07,
"logits/chosen": -0.8653970956802368,
"logits/rejected": -0.8456276059150696,
"logps/chosen": -245.4098663330078,
"logps/rejected": -248.41461181640625,
"loss": 0.6928,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.005286152008920908,
"rewards/margins": 0.000985494116321206,
"rewards/rejected": -0.006271645426750183,
"step": 22
},
{
"epoch": 0.04916499665998664,
"grad_norm": 4.80080663754473,
"learning_rate": 4.893617021276595e-07,
"logits/chosen": -0.8655314445495605,
"logits/rejected": -0.8451917171478271,
"logps/chosen": -252.33546447753906,
"logps/rejected": -260.81475830078125,
"loss": 0.6921,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.006377603858709335,
"rewards/margins": 0.00297079561278224,
"rewards/rejected": -0.009348399937152863,
"step": 23
},
{
"epoch": 0.05130260521042084,
"grad_norm": 5.481285264708149,
"learning_rate": 5.106382978723403e-07,
"logits/chosen": -0.7310451865196228,
"logits/rejected": -0.7366085648536682,
"logps/chosen": -238.02166748046875,
"logps/rejected": -245.17308044433594,
"loss": 0.6918,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.010805780068039894,
"rewards/margins": 0.0019894172437489033,
"rewards/rejected": -0.01279519684612751,
"step": 24
},
{
"epoch": 0.053440213760855046,
"grad_norm": 4.561792775392447,
"learning_rate": 5.319148936170212e-07,
"logits/chosen": -0.9254141449928284,
"logits/rejected": -0.939468502998352,
"logps/chosen": -269.6241455078125,
"logps/rejected": -282.4432067871094,
"loss": 0.6931,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.01286405324935913,
"rewards/margins": 0.001102518755942583,
"rewards/rejected": -0.013966571539640427,
"step": 25
},
{
"epoch": 0.05557782231128924,
"grad_norm": 4.85781011184185,
"learning_rate": 5.531914893617021e-07,
"logits/chosen": -0.8391819000244141,
"logits/rejected": -0.8546662330627441,
"logps/chosen": -271.26068115234375,
"logps/rejected": -267.31024169921875,
"loss": 0.6925,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.008641035296022892,
"rewards/margins": 0.0034573455341160297,
"rewards/rejected": -0.012098381295800209,
"step": 26
},
{
"epoch": 0.05771543086172345,
"grad_norm": 5.072033355975492,
"learning_rate": 5.74468085106383e-07,
"logits/chosen": -0.8844251036643982,
"logits/rejected": -0.8849300742149353,
"logps/chosen": -243.93980407714844,
"logps/rejected": -248.54537963867188,
"loss": 0.6927,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.015005933120846748,
"rewards/margins": 0.0034025944769382477,
"rewards/rejected": -0.018408527597784996,
"step": 27
},
{
"epoch": 0.05985303941215765,
"grad_norm": 4.905934366826652,
"learning_rate": 5.957446808510638e-07,
"logits/chosen": -0.724337637424469,
"logits/rejected": -0.7232470512390137,
"logps/chosen": -262.2066345214844,
"logps/rejected": -267.26116943359375,
"loss": 0.6919,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.014386076480150223,
"rewards/margins": -0.0017046784050762653,
"rewards/rejected": -0.012681398540735245,
"step": 28
},
{
"epoch": 0.06199064796259185,
"grad_norm": 4.7342802483142705,
"learning_rate": 6.170212765957446e-07,
"logits/chosen": -0.8244236707687378,
"logits/rejected": -0.8045285940170288,
"logps/chosen": -218.7688751220703,
"logps/rejected": -219.35711669921875,
"loss": 0.6898,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.014018207788467407,
"rewards/margins": 0.00474612507969141,
"rewards/rejected": -0.018764331936836243,
"step": 29
},
{
"epoch": 0.06412825651302605,
"grad_norm": 5.185028135772882,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -0.7685129642486572,
"logits/rejected": -0.7588883638381958,
"logps/chosen": -265.58447265625,
"logps/rejected": -271.6627502441406,
"loss": 0.6911,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.023013589903712273,
"rewards/margins": -6.0059886891394854e-05,
"rewards/rejected": -0.02295352704823017,
"step": 30
},
{
"epoch": 0.06626586506346026,
"grad_norm": 5.174402492219036,
"learning_rate": 6.595744680851063e-07,
"logits/chosen": -0.8060805797576904,
"logits/rejected": -0.8104574084281921,
"logps/chosen": -253.12918090820312,
"logps/rejected": -262.47772216796875,
"loss": 0.6926,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.027180161327123642,
"rewards/margins": 0.0009983510244637728,
"rewards/rejected": -0.028178514912724495,
"step": 31
},
{
"epoch": 0.06840347361389446,
"grad_norm": 4.839677584710031,
"learning_rate": 6.808510638297872e-07,
"logits/chosen": -0.8107847571372986,
"logits/rejected": -0.8056558966636658,
"logps/chosen": -247.47384643554688,
"logps/rejected": -259.930419921875,
"loss": 0.6922,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.03201708570122719,
"rewards/margins": 0.0023030471056699753,
"rewards/rejected": -0.03432013466954231,
"step": 32
},
{
"epoch": 0.07054108216432865,
"grad_norm": 4.418696566904475,
"learning_rate": 7.021276595744681e-07,
"logits/chosen": -0.8691257834434509,
"logits/rejected": -0.891472339630127,
"logps/chosen": -229.89974975585938,
"logps/rejected": -220.62893676757812,
"loss": 0.6925,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.023654459044337273,
"rewards/margins": 0.0017885996494442225,
"rewards/rejected": -0.025443056598305702,
"step": 33
},
{
"epoch": 0.07267869071476286,
"grad_norm": 5.281949481266581,
"learning_rate": 7.23404255319149e-07,
"logits/chosen": -0.7926970720291138,
"logits/rejected": -0.7971447706222534,
"logps/chosen": -201.50173950195312,
"logps/rejected": -209.24432373046875,
"loss": 0.6883,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0300833098590374,
"rewards/margins": 0.011433225125074387,
"rewards/rejected": -0.041516534984111786,
"step": 34
},
{
"epoch": 0.07481629926519706,
"grad_norm": 5.310361096502114,
"learning_rate": 7.446808510638297e-07,
"logits/chosen": -0.910358726978302,
"logits/rejected": -0.8681845664978027,
"logps/chosen": -293.49481201171875,
"logps/rejected": -264.9764709472656,
"loss": 0.6929,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.042106445878744125,
"rewards/margins": -0.00045869359746575356,
"rewards/rejected": -0.04164774715900421,
"step": 35
},
{
"epoch": 0.07695390781563126,
"grad_norm": 4.880148293966411,
"learning_rate": 7.659574468085106e-07,
"logits/chosen": -0.9195268154144287,
"logits/rejected": -0.9358838796615601,
"logps/chosen": -219.29908752441406,
"logps/rejected": -223.91160583496094,
"loss": 0.6905,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.03970751538872719,
"rewards/margins": 0.009188718162477016,
"rewards/rejected": -0.04889623448252678,
"step": 36
},
{
"epoch": 0.07909151636606547,
"grad_norm": 4.918837324305735,
"learning_rate": 7.872340425531915e-07,
"logits/chosen": -0.7983888387680054,
"logits/rejected": -0.7829576134681702,
"logps/chosen": -236.22479248046875,
"logps/rejected": -230.52279663085938,
"loss": 0.6924,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.03082606941461563,
"rewards/margins": 0.007684895768761635,
"rewards/rejected": -0.038510967046022415,
"step": 37
},
{
"epoch": 0.08122912491649967,
"grad_norm": 4.697759235789417,
"learning_rate": 8.085106382978723e-07,
"logits/chosen": -0.9536780118942261,
"logits/rejected": -0.9445628523826599,
"logps/chosen": -239.7415771484375,
"logps/rejected": -250.46978759765625,
"loss": 0.6915,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.04406914860010147,
"rewards/margins": 0.007817601785063744,
"rewards/rejected": -0.05188675597310066,
"step": 38
},
{
"epoch": 0.08336673346693386,
"grad_norm": 4.942849749477713,
"learning_rate": 8.297872340425532e-07,
"logits/chosen": -0.8406745195388794,
"logits/rejected": -0.8202511668205261,
"logps/chosen": -283.8332824707031,
"logps/rejected": -289.7784729003906,
"loss": 0.6883,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.0502498485147953,
"rewards/margins": 0.014361884444952011,
"rewards/rejected": -0.06461173295974731,
"step": 39
},
{
"epoch": 0.08550434201736808,
"grad_norm": 5.117709083830907,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -0.8214735984802246,
"logits/rejected": -0.811273992061615,
"logps/chosen": -210.29600524902344,
"logps/rejected": -199.48020935058594,
"loss": 0.6884,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.04497796297073364,
"rewards/margins": 0.01133386418223381,
"rewards/rejected": -0.05631183087825775,
"step": 40
},
{
"epoch": 0.08764195056780227,
"grad_norm": 5.136196664411302,
"learning_rate": 8.723404255319149e-07,
"logits/chosen": -0.969085693359375,
"logits/rejected": -0.9578003287315369,
"logps/chosen": -252.95278930664062,
"logps/rejected": -256.9606018066406,
"loss": 0.6848,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.07370009273290634,
"rewards/margins": 0.004004061222076416,
"rewards/rejected": -0.07770414650440216,
"step": 41
},
{
"epoch": 0.08977955911823647,
"grad_norm": 4.838693140519435,
"learning_rate": 8.936170212765957e-07,
"logits/chosen": -0.8661520481109619,
"logits/rejected": -0.8457835912704468,
"logps/chosen": -304.5137634277344,
"logps/rejected": -289.595947265625,
"loss": 0.6883,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07082202285528183,
"rewards/margins": 0.015582293272018433,
"rewards/rejected": -0.08640430867671967,
"step": 42
},
{
"epoch": 0.09191716766867067,
"grad_norm": 4.957200914658608,
"learning_rate": 9.148936170212766e-07,
"logits/chosen": -0.8786011338233948,
"logits/rejected": -0.8692121505737305,
"logps/chosen": -241.05532836914062,
"logps/rejected": -243.45684814453125,
"loss": 0.6919,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07279819995164871,
"rewards/margins": 0.020279541611671448,
"rewards/rejected": -0.09307773411273956,
"step": 43
},
{
"epoch": 0.09405477621910488,
"grad_norm": 5.332532100522966,
"learning_rate": 9.361702127659575e-07,
"logits/chosen": -0.714208722114563,
"logits/rejected": -0.7126749157905579,
"logps/chosen": -319.6092834472656,
"logps/rejected": -301.8595886230469,
"loss": 0.6873,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07854731380939484,
"rewards/margins": 0.005709480959922075,
"rewards/rejected": -0.08425679802894592,
"step": 44
},
{
"epoch": 0.09619238476953908,
"grad_norm": 5.165598994277126,
"learning_rate": 9.574468085106384e-07,
"logits/chosen": -0.8318406343460083,
"logits/rejected": -0.849963903427124,
"logps/chosen": -255.63446044921875,
"logps/rejected": -259.7432556152344,
"loss": 0.6883,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.09484049677848816,
"rewards/margins": 0.010637722909450531,
"rewards/rejected": -0.10547823458909988,
"step": 45
},
{
"epoch": 0.09832999331997327,
"grad_norm": 4.871720241221463,
"learning_rate": 9.78723404255319e-07,
"logits/chosen": -0.8702428936958313,
"logits/rejected": -0.8339990377426147,
"logps/chosen": -316.18670654296875,
"logps/rejected": -329.9319152832031,
"loss": 0.6892,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.14160332083702087,
"rewards/margins": 0.013123790733516216,
"rewards/rejected": -0.15472710132598877,
"step": 46
},
{
"epoch": 0.10046760187040749,
"grad_norm": 5.158837089218199,
"learning_rate": 1e-06,
"logits/chosen": -0.8626521229743958,
"logits/rejected": -0.8603638410568237,
"logps/chosen": -247.8237762451172,
"logps/rejected": -249.759033203125,
"loss": 0.6913,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.09033364802598953,
"rewards/margins": 0.0012807990424335003,
"rewards/rejected": -0.09161444753408432,
"step": 47
},
{
"epoch": 0.10260521042084168,
"grad_norm": 5.2967028714823785,
"learning_rate": 9.999860125306348e-07,
"logits/chosen": -0.8659788370132446,
"logits/rejected": -0.8618423342704773,
"logps/chosen": -272.1561279296875,
"logps/rejected": -280.98040771484375,
"loss": 0.6893,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.11396947503089905,
"rewards/margins": -0.0073202308267354965,
"rewards/rejected": -0.106649249792099,
"step": 48
},
{
"epoch": 0.10474281897127588,
"grad_norm": 5.51515197326478,
"learning_rate": 9.999440509051367e-07,
"logits/chosen": -0.7946774363517761,
"logits/rejected": -0.8100728988647461,
"logps/chosen": -302.84283447265625,
"logps/rejected": -298.60955810546875,
"loss": 0.6849,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.1010870561003685,
"rewards/margins": 0.017633313313126564,
"rewards/rejected": -0.11872036755084991,
"step": 49
},
{
"epoch": 0.10688042752171009,
"grad_norm": 5.870386943751237,
"learning_rate": 9.998741174712533e-07,
"logits/chosen": -0.90606290102005,
"logits/rejected": -0.9065860509872437,
"logps/chosen": -257.7372741699219,
"logps/rejected": -241.87298583984375,
"loss": 0.6821,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.12090699374675751,
"rewards/margins": 0.024912692606449127,
"rewards/rejected": -0.14581969380378723,
"step": 50
},
{
"epoch": 0.10901803607214429,
"grad_norm": 5.544085731964276,
"learning_rate": 9.997762161417517e-07,
"logits/chosen": -0.8597516417503357,
"logits/rejected": -0.8242354393005371,
"logps/chosen": -244.0271759033203,
"logps/rejected": -262.000732421875,
"loss": 0.6771,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.11861881613731384,
"rewards/margins": 0.04306299239397049,
"rewards/rejected": -0.16168181598186493,
"step": 51
},
{
"epoch": 0.11115564462257849,
"grad_norm": 5.08779280072292,
"learning_rate": 9.996503523941992e-07,
"logits/chosen": -0.8984640836715698,
"logits/rejected": -0.8927853107452393,
"logps/chosen": -292.3353576660156,
"logps/rejected": -283.715576171875,
"loss": 0.6878,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.1378980576992035,
"rewards/margins": -0.0063457973301410675,
"rewards/rejected": -0.13155226409435272,
"step": 52
},
{
"epoch": 0.1132932531730127,
"grad_norm": 5.528861132333211,
"learning_rate": 9.994965332706572e-07,
"logits/chosen": -0.7924266457557678,
"logits/rejected": -0.7879197597503662,
"logps/chosen": -299.14617919921875,
"logps/rejected": -305.268798828125,
"loss": 0.6822,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.14223326742649078,
"rewards/margins": 0.029797088354825974,
"rewards/rejected": -0.17203034460544586,
"step": 53
},
{
"epoch": 0.1154308617234469,
"grad_norm": 5.45602380596692,
"learning_rate": 9.99314767377287e-07,
"logits/chosen": -0.9068719744682312,
"logits/rejected": -0.8776203393936157,
"logps/chosen": -288.920166015625,
"logps/rejected": -288.0073547363281,
"loss": 0.6782,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.12053351104259491,
"rewards/margins": 0.04062645137310028,
"rewards/rejected": -0.161159947514534,
"step": 54
},
{
"epoch": 0.11756847027388109,
"grad_norm": 5.381814409133637,
"learning_rate": 9.991050648838675e-07,
"logits/chosen": -0.8684936165809631,
"logits/rejected": -0.8693514466285706,
"logps/chosen": -300.5417175292969,
"logps/rejected": -297.93609619140625,
"loss": 0.6852,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08863644301891327,
"rewards/margins": 0.03236062452197075,
"rewards/rejected": -0.12099706381559372,
"step": 55
},
{
"epoch": 0.1197060788243153,
"grad_norm": 4.981246770478922,
"learning_rate": 9.98867437523228e-07,
"logits/chosen": -0.7902661561965942,
"logits/rejected": -0.7963244915008545,
"logps/chosen": -302.9090576171875,
"logps/rejected": -296.0736389160156,
"loss": 0.6823,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.10501404106616974,
"rewards/margins": 0.045005738735198975,
"rewards/rejected": -0.15001976490020752,
"step": 56
},
{
"epoch": 0.1218436873747495,
"grad_norm": 5.95244558017509,
"learning_rate": 9.986018985905899e-07,
"logits/chosen": -0.933331310749054,
"logits/rejected": -0.9271438121795654,
"logps/chosen": -257.21197509765625,
"logps/rejected": -258.4394226074219,
"loss": 0.6847,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1376655399799347,
"rewards/margins": 0.020258434116840363,
"rewards/rejected": -0.15792396664619446,
"step": 57
},
{
"epoch": 0.1239812959251837,
"grad_norm": 5.625394184294828,
"learning_rate": 9.983084629428244e-07,
"logits/chosen": -0.790676474571228,
"logits/rejected": -0.7989400625228882,
"logps/chosen": -216.31825256347656,
"logps/rejected": -239.0472869873047,
"loss": 0.6822,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.12253975123167038,
"rewards/margins": 0.032085709273815155,
"rewards/rejected": -0.15462547540664673,
"step": 58
},
{
"epoch": 0.1261189044756179,
"grad_norm": 5.3018065912112835,
"learning_rate": 9.979871469976195e-07,
"logits/chosen": -0.7393543720245361,
"logits/rejected": -0.7129000425338745,
"logps/chosen": -311.56878662109375,
"logps/rejected": -291.1382751464844,
"loss": 0.6848,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.24215912818908691,
"rewards/margins": -0.015711378306150436,
"rewards/rejected": -0.22644776105880737,
"step": 59
},
{
"epoch": 0.1282565130260521,
"grad_norm": 5.89246728884038,
"learning_rate": 9.97637968732563e-07,
"logits/chosen": -0.8696700930595398,
"logits/rejected": -0.8711199760437012,
"logps/chosen": -246.91502380371094,
"logps/rejected": -262.1573791503906,
"loss": 0.6851,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1573953628540039,
"rewards/margins": 0.03878934681415558,
"rewards/rejected": -0.1961846947669983,
"step": 60
},
{
"epoch": 0.1303941215764863,
"grad_norm": 5.3302706046399555,
"learning_rate": 9.972609476841365e-07,
"logits/chosen": -0.915327787399292,
"logits/rejected": -0.8959137201309204,
"logps/chosen": -273.5627136230469,
"logps/rejected": -297.04962158203125,
"loss": 0.6851,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.21575269103050232,
"rewards/margins": 0.07058089226484299,
"rewards/rejected": -0.2863335907459259,
"step": 61
},
{
"epoch": 0.13253173012692052,
"grad_norm": 5.298125253303342,
"learning_rate": 9.968561049466213e-07,
"logits/chosen": -0.8035833239555359,
"logits/rejected": -0.8177482485771179,
"logps/chosen": -258.7190246582031,
"logps/rejected": -260.00408935546875,
"loss": 0.6761,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1654270738363266,
"rewards/margins": 0.03356565535068512,
"rewards/rejected": -0.19899272918701172,
"step": 62
},
{
"epoch": 0.1346693386773547,
"grad_norm": 5.644014199691322,
"learning_rate": 9.964234631709185e-07,
"logits/chosen": -0.8946092128753662,
"logits/rejected": -0.8983243703842163,
"logps/chosen": -272.2535095214844,
"logps/rejected": -278.0460205078125,
"loss": 0.6812,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.18145883083343506,
"rewards/margins": 0.05900725722312927,
"rewards/rejected": -0.24046610295772552,
"step": 63
},
{
"epoch": 0.1368069472277889,
"grad_norm": 6.088482546530936,
"learning_rate": 9.959630465632831e-07,
"logits/chosen": -0.8606098890304565,
"logits/rejected": -0.8623652458190918,
"logps/chosen": -256.6067199707031,
"logps/rejected": -273.53668212890625,
"loss": 0.6753,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.17427769303321838,
"rewards/margins": 0.05003924295306206,
"rewards/rejected": -0.22431692481040955,
"step": 64
},
{
"epoch": 0.13894455577822312,
"grad_norm": 5.611060962151761,
"learning_rate": 9.954748808839674e-07,
"logits/chosen": -0.8806792497634888,
"logits/rejected": -0.8958165645599365,
"logps/chosen": -275.52301025390625,
"logps/rejected": -273.21563720703125,
"loss": 0.6823,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.2500859200954437,
"rewards/margins": 0.017448339611291885,
"rewards/rejected": -0.2675342261791229,
"step": 65
},
{
"epoch": 0.1410821643286573,
"grad_norm": 5.918149017741809,
"learning_rate": 9.949589934457814e-07,
"logits/chosen": -0.8888027667999268,
"logits/rejected": -0.871585488319397,
"logps/chosen": -248.55703735351562,
"logps/rejected": -258.9693603515625,
"loss": 0.6824,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.1963491439819336,
"rewards/margins": 0.04409556835889816,
"rewards/rejected": -0.24044471979141235,
"step": 66
},
{
"epoch": 0.14321977287909152,
"grad_norm": 6.698179177771139,
"learning_rate": 9.944154131125642e-07,
"logits/chosen": -0.853302001953125,
"logits/rejected": -0.848848819732666,
"logps/chosen": -277.59442138671875,
"logps/rejected": -297.14141845703125,
"loss": 0.6639,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.24216346442699432,
"rewards/margins": 0.10889790952205658,
"rewards/rejected": -0.3510614037513733,
"step": 67
},
{
"epoch": 0.14535738142952573,
"grad_norm": 5.596769283806181,
"learning_rate": 9.938441702975689e-07,
"logits/chosen": -0.7764022350311279,
"logits/rejected": -0.7560886144638062,
"logps/chosen": -250.94287109375,
"logps/rejected": -250.5952606201172,
"loss": 0.6732,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2156543880701065,
"rewards/margins": 0.03958458825945854,
"rewards/rejected": -0.25523898005485535,
"step": 68
},
{
"epoch": 0.1474949899799599,
"grad_norm": 5.913968144404886,
"learning_rate": 9.932452969617607e-07,
"logits/chosen": -0.7237470746040344,
"logits/rejected": -0.7399138808250427,
"logps/chosen": -244.21449279785156,
"logps/rejected": -254.1151123046875,
"loss": 0.6695,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.1943284124135971,
"rewards/margins": 0.060313306748867035,
"rewards/rejected": -0.25464171171188354,
"step": 69
},
{
"epoch": 0.14963259853039412,
"grad_norm": 5.940310444508497,
"learning_rate": 9.926188266120295e-07,
"logits/chosen": -0.8615679144859314,
"logits/rejected": -0.8436312675476074,
"logps/chosen": -256.257080078125,
"logps/rejected": -262.7105712890625,
"loss": 0.679,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.17094658315181732,
"rewards/margins": 0.055061645805835724,
"rewards/rejected": -0.22600823640823364,
"step": 70
},
{
"epoch": 0.15177020708082833,
"grad_norm": 5.928886998788439,
"learning_rate": 9.919647942993147e-07,
"logits/chosen": -0.8513661623001099,
"logits/rejected": -0.8609136343002319,
"logps/chosen": -299.2288818359375,
"logps/rejected": -326.5621032714844,
"loss": 0.6711,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.22696439921855927,
"rewards/margins": 0.04341430217027664,
"rewards/rejected": -0.2703787088394165,
"step": 71
},
{
"epoch": 0.15390781563126252,
"grad_norm": 5.791936546761731,
"learning_rate": 9.912832366166441e-07,
"logits/chosen": -0.756388783454895,
"logits/rejected": -0.734666109085083,
"logps/chosen": -299.2653503417969,
"logps/rejected": -307.1020202636719,
"loss": 0.6727,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.3383556604385376,
"rewards/margins": 0.01706152781844139,
"rewards/rejected": -0.3554171919822693,
"step": 72
},
{
"epoch": 0.15604542418169673,
"grad_norm": 6.05455714563325,
"learning_rate": 9.905741916970863e-07,
"logits/chosen": -0.9010551571846008,
"logits/rejected": -0.8836992383003235,
"logps/chosen": -339.32806396484375,
"logps/rejected": -335.24285888671875,
"loss": 0.6703,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.40806159377098083,
"rewards/margins": -0.019612746313214302,
"rewards/rejected": -0.38844889402389526,
"step": 73
},
{
"epoch": 0.15818303273213094,
"grad_norm": 6.2106979275919025,
"learning_rate": 9.898376992116177e-07,
"logits/chosen": -0.9612334370613098,
"logits/rejected": -0.9398088455200195,
"logps/chosen": -282.431640625,
"logps/rejected": -281.66558837890625,
"loss": 0.6763,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3227473497390747,
"rewards/margins": 0.047923244535923004,
"rewards/rejected": -0.3706705868244171,
"step": 74
},
{
"epoch": 0.16032064128256512,
"grad_norm": 5.916909013866816,
"learning_rate": 9.890738003669027e-07,
"logits/chosen": -0.8319401741027832,
"logits/rejected": -0.815265953540802,
"logps/chosen": -281.00518798828125,
"logps/rejected": -273.9776306152344,
"loss": 0.6605,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3827057480812073,
"rewards/margins": 0.0613156333565712,
"rewards/rejected": -0.4440213441848755,
"step": 75
},
{
"epoch": 0.16245824983299934,
"grad_norm": 6.67755131316461,
"learning_rate": 9.882825379029882e-07,
"logits/chosen": -0.8953054547309875,
"logits/rejected": -0.894780695438385,
"logps/chosen": -312.055908203125,
"logps/rejected": -330.704833984375,
"loss": 0.6602,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.45433858036994934,
"rewards/margins": 0.07449479401111603,
"rewards/rejected": -0.5288333892822266,
"step": 76
},
{
"epoch": 0.16459585838343355,
"grad_norm": 6.2117918809225765,
"learning_rate": 9.874639560909118e-07,
"logits/chosen": -0.9046330451965332,
"logits/rejected": -0.898413360118866,
"logps/chosen": -294.0129089355469,
"logps/rejected": -299.68560791015625,
"loss": 0.6753,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4101888835430145,
"rewards/margins": 0.12080243229866028,
"rewards/rejected": -0.5309913158416748,
"step": 77
},
{
"epoch": 0.16673346693386773,
"grad_norm": 5.632634022928361,
"learning_rate": 9.866181007302256e-07,
"logits/chosen": -0.6335713267326355,
"logits/rejected": -0.6313363313674927,
"logps/chosen": -281.41400146484375,
"logps/rejected": -291.63800048828125,
"loss": 0.6659,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.36711931228637695,
"rewards/margins": 0.1256605088710785,
"rewards/rejected": -0.49277979135513306,
"step": 78
},
{
"epoch": 0.16887107548430194,
"grad_norm": 6.069106827042514,
"learning_rate": 9.857450191464337e-07,
"logits/chosen": -0.7797252535820007,
"logits/rejected": -0.7820223569869995,
"logps/chosen": -256.88421630859375,
"logps/rejected": -279.4145812988281,
"loss": 0.653,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.3781868815422058,
"rewards/margins": 0.07347656786441803,
"rewards/rejected": -0.45166343450546265,
"step": 79
},
{
"epoch": 0.17100868403473615,
"grad_norm": 6.074173522598461,
"learning_rate": 9.848447601883433e-07,
"logits/chosen": -0.8622775673866272,
"logits/rejected": -0.8331011533737183,
"logps/chosen": -309.4617614746094,
"logps/rejected": -330.5566101074219,
"loss": 0.6552,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.49035871028900146,
"rewards/margins": 0.17824454605579376,
"rewards/rejected": -0.6686033010482788,
"step": 80
},
{
"epoch": 0.17314629258517034,
"grad_norm": 6.5133260441754395,
"learning_rate": 9.839173742253334e-07,
"logits/chosen": -0.7489383816719055,
"logits/rejected": -0.781232476234436,
"logps/chosen": -296.9482116699219,
"logps/rejected": -327.5967712402344,
"loss": 0.6688,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.5791828036308289,
"rewards/margins": 0.188523530960083,
"rewards/rejected": -0.7677063345909119,
"step": 81
},
{
"epoch": 0.17528390113560455,
"grad_norm": 5.74672853077721,
"learning_rate": 9.82962913144534e-07,
"logits/chosen": -0.8432500958442688,
"logits/rejected": -0.8211543560028076,
"logps/chosen": -293.7790222167969,
"logps/rejected": -304.9800720214844,
"loss": 0.6522,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.47243934869766235,
"rewards/margins": 0.12751685082912445,
"rewards/rejected": -0.599956214427948,
"step": 82
},
{
"epoch": 0.17742150968603873,
"grad_norm": 6.3990299421699675,
"learning_rate": 9.819814303479267e-07,
"logits/chosen": -0.9426258206367493,
"logits/rejected": -0.9214622378349304,
"logps/chosen": -290.99407958984375,
"logps/rejected": -301.18212890625,
"loss": 0.652,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.49375712871551514,
"rewards/margins": 0.1566104143857956,
"rewards/rejected": -0.6503674983978271,
"step": 83
},
{
"epoch": 0.17955911823647294,
"grad_norm": 6.534280177132367,
"learning_rate": 9.80972980749353e-07,
"logits/chosen": -0.8522071838378906,
"logits/rejected": -0.8386092185974121,
"logps/chosen": -345.668212890625,
"logps/rejected": -346.40960693359375,
"loss": 0.67,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6266711950302124,
"rewards/margins": 0.09361431002616882,
"rewards/rejected": -0.7202855348587036,
"step": 84
},
{
"epoch": 0.18169672678690715,
"grad_norm": 6.649073031684906,
"learning_rate": 9.799376207714444e-07,
"logits/chosen": -0.7474217414855957,
"logits/rejected": -0.7404229044914246,
"logps/chosen": -275.940673828125,
"logps/rejected": -290.2484130859375,
"loss": 0.6365,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.41143321990966797,
"rewards/margins": 0.08391736447811127,
"rewards/rejected": -0.49535059928894043,
"step": 85
},
{
"epoch": 0.18383433533734134,
"grad_norm": 6.963291541132159,
"learning_rate": 9.788754083424652e-07,
"logits/chosen": -0.824079692363739,
"logits/rejected": -0.8041766285896301,
"logps/chosen": -321.2813720703125,
"logps/rejected": -339.7249450683594,
"loss": 0.6636,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.5758030414581299,
"rewards/margins": 0.19611942768096924,
"rewards/rejected": -0.7719224095344543,
"step": 86
},
{
"epoch": 0.18597194388777555,
"grad_norm": 6.945463717696004,
"learning_rate": 9.777864028930705e-07,
"logits/chosen": -0.7686063647270203,
"logits/rejected": -0.7663296461105347,
"logps/chosen": -349.73004150390625,
"logps/rejected": -375.2843017578125,
"loss": 0.626,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6469497680664062,
"rewards/margins": 0.2732096314430237,
"rewards/rejected": -0.9201593399047852,
"step": 87
},
{
"epoch": 0.18810955243820976,
"grad_norm": 6.714366991925423,
"learning_rate": 9.766706653529812e-07,
"logits/chosen": -0.782423734664917,
"logits/rejected": -0.7881312966346741,
"logps/chosen": -301.2457275390625,
"logps/rejected": -310.0863037109375,
"loss": 0.6652,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5998435616493225,
"rewards/margins": 0.09575268626213074,
"rewards/rejected": -0.6955962777137756,
"step": 88
},
{
"epoch": 0.19024716098864394,
"grad_norm": 7.241214530195881,
"learning_rate": 9.755282581475767e-07,
"logits/chosen": -0.8655251860618591,
"logits/rejected": -0.8472452163696289,
"logps/chosen": -398.3143310546875,
"logps/rejected": -434.9195556640625,
"loss": 0.6159,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.8902552723884583,
"rewards/margins": 0.29991066455841064,
"rewards/rejected": -1.1901659965515137,
"step": 89
},
{
"epoch": 0.19238476953907815,
"grad_norm": 7.90505927903396,
"learning_rate": 9.743592451943998e-07,
"logits/chosen": -0.8578193783760071,
"logits/rejected": -0.8561904430389404,
"logps/chosen": -281.01824951171875,
"logps/rejected": -304.5150146484375,
"loss": 0.6939,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.7082527875900269,
"rewards/margins": 0.08812718093395233,
"rewards/rejected": -0.7963800430297852,
"step": 90
},
{
"epoch": 0.19452237808951237,
"grad_norm": 7.5921079251944,
"learning_rate": 9.73163691899582e-07,
"logits/chosen": -0.678159236907959,
"logits/rejected": -0.6668828725814819,
"logps/chosen": -300.15338134765625,
"logps/rejected": -306.63525390625,
"loss": 0.6812,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6471429467201233,
"rewards/margins": 0.08188958466053009,
"rewards/rejected": -0.729032576084137,
"step": 91
},
{
"epoch": 0.19665998663994655,
"grad_norm": 7.137628936459269,
"learning_rate": 9.719416651541837e-07,
"logits/chosen": -0.8150886297225952,
"logits/rejected": -0.8088028430938721,
"logps/chosen": -431.6229248046875,
"logps/rejected": -458.9399108886719,
"loss": 0.643,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0566003322601318,
"rewards/margins": 0.2619227468967438,
"rewards/rejected": -1.3185230493545532,
"step": 92
},
{
"epoch": 0.19879759519038076,
"grad_norm": 6.729473146383851,
"learning_rate": 9.706932333304517e-07,
"logits/chosen": -0.8243950605392456,
"logits/rejected": -0.838744580745697,
"logps/chosen": -312.406494140625,
"logps/rejected": -335.4088134765625,
"loss": 0.6498,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.7556511759757996,
"rewards/margins": 0.031043091788887978,
"rewards/rejected": -0.7866942882537842,
"step": 93
},
{
"epoch": 0.20093520374081497,
"grad_norm": 6.624154045427617,
"learning_rate": 9.694184662779929e-07,
"logits/chosen": -0.783348560333252,
"logits/rejected": -0.7991134524345398,
"logps/chosen": -289.2900695800781,
"logps/rejected": -290.5962829589844,
"loss": 0.6525,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.6890867352485657,
"rewards/margins": 0.08654731512069702,
"rewards/rejected": -0.7756341099739075,
"step": 94
},
{
"epoch": 0.20307281229124916,
"grad_norm": 7.588312119029146,
"learning_rate": 9.681174353198686e-07,
"logits/chosen": -0.8928613066673279,
"logits/rejected": -0.9167020916938782,
"logps/chosen": -263.0621032714844,
"logps/rejected": -291.3228759765625,
"loss": 0.6785,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.6083306670188904,
"rewards/margins": 0.10731954127550125,
"rewards/rejected": -0.715650200843811,
"step": 95
},
{
"epoch": 0.20521042084168337,
"grad_norm": 7.93175048638323,
"learning_rate": 9.667902132486008e-07,
"logits/chosen": -0.7266509532928467,
"logits/rejected": -0.7005448341369629,
"logps/chosen": -355.4562072753906,
"logps/rejected": -368.688232421875,
"loss": 0.6808,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9357940554618835,
"rewards/margins": 0.1980743259191513,
"rewards/rejected": -1.1338684558868408,
"step": 96
},
{
"epoch": 0.20734802939211758,
"grad_norm": 7.282370392547328,
"learning_rate": 9.65436874322102e-07,
"logits/chosen": -0.7565743327140808,
"logits/rejected": -0.765534520149231,
"logps/chosen": -360.4274597167969,
"logps/rejected": -397.3307189941406,
"loss": 0.6365,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9580825567245483,
"rewards/margins": 0.27880433201789856,
"rewards/rejected": -1.2368868589401245,
"step": 97
},
{
"epoch": 0.20948563794255176,
"grad_norm": 7.307890632023091,
"learning_rate": 9.640574942595194e-07,
"logits/chosen": -0.6865275502204895,
"logits/rejected": -0.6510294079780579,
"logps/chosen": -299.5666198730469,
"logps/rejected": -315.7306823730469,
"loss": 0.637,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6532863974571228,
"rewards/margins": 0.1549152433872223,
"rewards/rejected": -0.8082016706466675,
"step": 98
},
{
"epoch": 0.21162324649298597,
"grad_norm": 7.447581281931192,
"learning_rate": 9.626521502369983e-07,
"logits/chosen": -0.6352126598358154,
"logits/rejected": -0.6191614866256714,
"logps/chosen": -293.2029113769531,
"logps/rejected": -306.13330078125,
"loss": 0.6658,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.7388824224472046,
"rewards/margins": 0.15435971319675446,
"rewards/rejected": -0.8932421803474426,
"step": 99
},
{
"epoch": 0.21376085504342018,
"grad_norm": 6.648161187751906,
"learning_rate": 9.612209208833646e-07,
"logits/chosen": -0.7408478856086731,
"logits/rejected": -0.7513828277587891,
"logps/chosen": -301.5423583984375,
"logps/rejected": -345.68682861328125,
"loss": 0.628,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7927481532096863,
"rewards/margins": 0.24078083038330078,
"rewards/rejected": -1.0335289239883423,
"step": 100
},
{
"epoch": 0.21376085504342018,
"eval_logits/chosen": -0.7527643442153931,
"eval_logits/rejected": -0.7538674473762512,
"eval_logps/chosen": -343.6059875488281,
"eval_logps/rejected": -362.7133483886719,
"eval_loss": 0.6641345024108887,
"eval_rewards/accuracies": 0.6239837408065796,
"eval_rewards/chosen": -0.8805798888206482,
"eval_rewards/margins": 0.1340140700340271,
"eval_rewards/rejected": -1.0145939588546753,
"eval_runtime": 372.3126,
"eval_samples_per_second": 5.267,
"eval_steps_per_second": 0.33,
"step": 100
},
{
"epoch": 0.21589846359385437,
"grad_norm": 7.778718674441958,
"learning_rate": 9.597638862757253e-07,
"logits/chosen": -0.8201433420181274,
"logits/rejected": -0.8069182634353638,
"logps/chosen": -256.0120849609375,
"logps/rejected": -269.8443603515625,
"loss": 0.6831,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.656363844871521,
"rewards/margins": 0.05751778930425644,
"rewards/rejected": -0.7138815522193909,
"step": 101
},
{
"epoch": 0.21803607214428858,
"grad_norm": 7.5706021854045185,
"learning_rate": 9.58281127934988e-07,
"logits/chosen": -0.6860804557800293,
"logits/rejected": -0.7110453844070435,
"logps/chosen": -368.2939453125,
"logps/rejected": -393.86029052734375,
"loss": 0.6576,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.020776391029358,
"rewards/margins": 0.1516759693622589,
"rewards/rejected": -1.172452449798584,
"step": 102
},
{
"epoch": 0.2201736806947228,
"grad_norm": 8.607842129213472,
"learning_rate": 9.567727288213004e-07,
"logits/chosen": -0.7699592113494873,
"logits/rejected": -0.7589491605758667,
"logps/chosen": -324.6326904296875,
"logps/rejected": -358.59820556640625,
"loss": 0.7094,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.0056777000427246,
"rewards/margins": 0.18924392759799957,
"rewards/rejected": -1.1949217319488525,
"step": 103
},
{
"epoch": 0.22231128924515697,
"grad_norm": 7.291755560282041,
"learning_rate": 9.552387733294078e-07,
"logits/chosen": -0.6555180549621582,
"logits/rejected": -0.6659807562828064,
"logps/chosen": -330.6410827636719,
"logps/rejected": -359.6870422363281,
"loss": 0.6453,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8741835355758667,
"rewards/margins": 0.2183988094329834,
"rewards/rejected": -1.09258234500885,
"step": 104
},
{
"epoch": 0.22444889779559118,
"grad_norm": 7.775554579983475,
"learning_rate": 9.536793472839324e-07,
"logits/chosen": -0.6701323986053467,
"logits/rejected": -0.6589778661727905,
"logps/chosen": -285.3506164550781,
"logps/rejected": -288.3831481933594,
"loss": 0.6488,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.7929357290267944,
"rewards/margins": 0.09136777371168137,
"rewards/rejected": -0.8843034505844116,
"step": 105
},
{
"epoch": 0.2265865063460254,
"grad_norm": 7.266567693754681,
"learning_rate": 9.520945379345699e-07,
"logits/chosen": -0.8183209300041199,
"logits/rejected": -0.8361554741859436,
"logps/chosen": -397.4153747558594,
"logps/rejected": -423.17333984375,
"loss": 0.6383,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.1003904342651367,
"rewards/margins": 0.1266530454158783,
"rewards/rejected": -1.2270435094833374,
"step": 106
},
{
"epoch": 0.22872411489645958,
"grad_norm": 7.518282958403423,
"learning_rate": 9.504844339512094e-07,
"logits/chosen": -0.8879948854446411,
"logits/rejected": -0.8571330904960632,
"logps/chosen": -287.59051513671875,
"logps/rejected": -297.351318359375,
"loss": 0.6476,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7065630555152893,
"rewards/margins": 0.15939508378505707,
"rewards/rejected": -0.8659580945968628,
"step": 107
},
{
"epoch": 0.2308617234468938,
"grad_norm": 7.824671981101,
"learning_rate": 9.488491254189716e-07,
"logits/chosen": -0.8066489696502686,
"logits/rejected": -0.8055952191352844,
"logps/chosen": -404.3518981933594,
"logps/rejected": -442.6438903808594,
"loss": 0.6404,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0566518306732178,
"rewards/margins": 0.3284587264060974,
"rewards/rejected": -1.3851103782653809,
"step": 108
},
{
"epoch": 0.232999331997328,
"grad_norm": 8.83083617083813,
"learning_rate": 9.471887038331684e-07,
"logits/chosen": -0.7246598601341248,
"logits/rejected": -0.7441533207893372,
"logps/chosen": -354.1577453613281,
"logps/rejected": -366.261962890625,
"loss": 0.6873,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9086767435073853,
"rewards/margins": 0.14375557005405426,
"rewards/rejected": -1.0524324178695679,
"step": 109
},
{
"epoch": 0.23513694054776219,
"grad_norm": 6.762910416252425,
"learning_rate": 9.455032620941839e-07,
"logits/chosen": -0.7163547277450562,
"logits/rejected": -0.7031821608543396,
"logps/chosen": -281.1316833496094,
"logps/rejected": -283.372314453125,
"loss": 0.6652,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.6051456332206726,
"rewards/margins": 0.09137356281280518,
"rewards/rejected": -0.6965191960334778,
"step": 110
},
{
"epoch": 0.2372745490981964,
"grad_norm": 7.354791673779593,
"learning_rate": 9.43792894502277e-07,
"logits/chosen": -0.6413677334785461,
"logits/rejected": -0.6314007043838501,
"logps/chosen": -341.87396240234375,
"logps/rejected": -356.4854736328125,
"loss": 0.6642,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.8350028395652771,
"rewards/margins": 0.21570800244808197,
"rewards/rejected": -1.050710916519165,
"step": 111
},
{
"epoch": 0.2394121576486306,
"grad_norm": 7.625646719699033,
"learning_rate": 9.420576967523048e-07,
"logits/chosen": -0.7540197968482971,
"logits/rejected": -0.7288798093795776,
"logps/chosen": -290.5899963378906,
"logps/rejected": -294.30804443359375,
"loss": 0.6563,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6742240786552429,
"rewards/margins": 0.1976398229598999,
"rewards/rejected": -0.8718639612197876,
"step": 112
},
{
"epoch": 0.2415497661990648,
"grad_norm": 7.749312449639858,
"learning_rate": 9.402977659283689e-07,
"logits/chosen": -0.773981511592865,
"logits/rejected": -0.7674249410629272,
"logps/chosen": -323.57000732421875,
"logps/rejected": -349.71990966796875,
"loss": 0.6365,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7962589859962463,
"rewards/margins": 0.18393486738204956,
"rewards/rejected": -0.9801937937736511,
"step": 113
},
{
"epoch": 0.243687374749499,
"grad_norm": 7.4503816098925055,
"learning_rate": 9.385132004983832e-07,
"logits/chosen": -0.7875250577926636,
"logits/rejected": -0.7886217832565308,
"logps/chosen": -289.820068359375,
"logps/rejected": -307.18914794921875,
"loss": 0.6351,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6762690544128418,
"rewards/margins": 0.16321714222431183,
"rewards/rejected": -0.8394861817359924,
"step": 114
},
{
"epoch": 0.2458249832999332,
"grad_norm": 7.383473143295883,
"learning_rate": 9.367041003085648e-07,
"logits/chosen": -0.811254620552063,
"logits/rejected": -0.8413273692131042,
"logps/chosen": -328.42877197265625,
"logps/rejected": -360.35430908203125,
"loss": 0.6373,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6836004257202148,
"rewards/margins": 0.1819692850112915,
"rewards/rejected": -0.8655696511268616,
"step": 115
},
{
"epoch": 0.2479625918503674,
"grad_norm": 6.933138308165148,
"learning_rate": 9.348705665778477e-07,
"logits/chosen": -0.7606134414672852,
"logits/rejected": -0.7490028142929077,
"logps/chosen": -342.7862548828125,
"logps/rejected": -355.22943115234375,
"loss": 0.6449,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.9342496991157532,
"rewards/margins": 0.09783473610877991,
"rewards/rejected": -1.0320844650268555,
"step": 116
},
{
"epoch": 0.2501002004008016,
"grad_norm": 6.9991891498789744,
"learning_rate": 9.330127018922193e-07,
"logits/chosen": -0.7081186771392822,
"logits/rejected": -0.7329989075660706,
"logps/chosen": -361.0794372558594,
"logps/rejected": -369.33721923828125,
"loss": 0.6483,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.8992618322372437,
"rewards/margins": 0.12093706429004669,
"rewards/rejected": -1.020198941230774,
"step": 117
},
{
"epoch": 0.2522378089512358,
"grad_norm": 6.891968237145087,
"learning_rate": 9.311306101989812e-07,
"logits/chosen": -0.7707226872444153,
"logits/rejected": -0.775468111038208,
"logps/chosen": -328.4278869628906,
"logps/rejected": -375.73712158203125,
"loss": 0.6215,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7947558164596558,
"rewards/margins": 0.2816506326198578,
"rewards/rejected": -1.076406478881836,
"step": 118
},
{
"epoch": 0.25437541750167003,
"grad_norm": 7.78238688784484,
"learning_rate": 9.29224396800933e-07,
"logits/chosen": -0.8061501383781433,
"logits/rejected": -0.7823886275291443,
"logps/chosen": -322.4601135253906,
"logps/rejected": -329.06744384765625,
"loss": 0.6551,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8091481924057007,
"rewards/margins": -0.014971929602324963,
"rewards/rejected": -0.7941762208938599,
"step": 119
},
{
"epoch": 0.2565130260521042,
"grad_norm": 7.538013946361546,
"learning_rate": 9.272941683504808e-07,
"logits/chosen": -0.7215307950973511,
"logits/rejected": -0.7078826427459717,
"logps/chosen": -356.06005859375,
"logps/rejected": -357.49774169921875,
"loss": 0.6384,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8885184526443481,
"rewards/margins": 0.18230639398097992,
"rewards/rejected": -1.0708248615264893,
"step": 120
},
{
"epoch": 0.2586506346025384,
"grad_norm": 7.545420813102953,
"learning_rate": 9.253400328436698e-07,
"logits/chosen": -0.7346601486206055,
"logits/rejected": -0.7335522174835205,
"logps/chosen": -344.805419921875,
"logps/rejected": -350.87725830078125,
"loss": 0.6594,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.8253999948501587,
"rewards/margins": 0.0806780755519867,
"rewards/rejected": -0.9060779809951782,
"step": 121
},
{
"epoch": 0.2607882431529726,
"grad_norm": 7.7074461457899535,
"learning_rate": 9.233620996141421e-07,
"logits/chosen": -0.8815721273422241,
"logits/rejected": -0.8621220588684082,
"logps/chosen": -336.6763610839844,
"logps/rejected": -341.74798583984375,
"loss": 0.6341,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7817858457565308,
"rewards/margins": 0.07886642962694168,
"rewards/rejected": -0.8606522083282471,
"step": 122
},
{
"epoch": 0.2629258517034068,
"grad_norm": 7.761484323525846,
"learning_rate": 9.213604793270196e-07,
"logits/chosen": -0.8222033977508545,
"logits/rejected": -0.8148404955863953,
"logps/chosen": -303.2247009277344,
"logps/rejected": -315.91888427734375,
"loss": 0.6419,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7392472624778748,
"rewards/margins": 0.13012456893920898,
"rewards/rejected": -0.869371771812439,
"step": 123
},
{
"epoch": 0.26506346025384103,
"grad_norm": 8.151352928349633,
"learning_rate": 9.19335283972712e-07,
"logits/chosen": -0.7656688690185547,
"logits/rejected": -0.7709140181541443,
"logps/chosen": -374.8747253417969,
"logps/rejected": -376.098876953125,
"loss": 0.6754,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.005488634109497,
"rewards/margins": 0.06390834599733353,
"rewards/rejected": -1.06939697265625,
"step": 124
},
{
"epoch": 0.26720106880427524,
"grad_norm": 7.63262703375028,
"learning_rate": 9.172866268606513e-07,
"logits/chosen": -0.755223274230957,
"logits/rejected": -0.7677374482154846,
"logps/chosen": -372.7818603515625,
"logps/rejected": -385.9354248046875,
"loss": 0.6662,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8667163848876953,
"rewards/margins": 0.20413543283939362,
"rewards/rejected": -1.0708518028259277,
"step": 125
},
{
"epoch": 0.2693386773547094,
"grad_norm": 7.478834701874977,
"learning_rate": 9.152146226129518e-07,
"logits/chosen": -0.7996259927749634,
"logits/rejected": -0.7835624814033508,
"logps/chosen": -292.76129150390625,
"logps/rejected": -333.20477294921875,
"loss": 0.6172,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7281415462493896,
"rewards/margins": 0.3209742605686188,
"rewards/rejected": -1.049115777015686,
"step": 126
},
{
"epoch": 0.2714762859051436,
"grad_norm": 7.082169803011623,
"learning_rate": 9.131193871579974e-07,
"logits/chosen": -0.8138784766197205,
"logits/rejected": -0.829187273979187,
"logps/chosen": -353.7518615722656,
"logps/rejected": -404.1153564453125,
"loss": 0.6436,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8756864070892334,
"rewards/margins": 0.2420441061258316,
"rewards/rejected": -1.1177304983139038,
"step": 127
},
{
"epoch": 0.2736138944555778,
"grad_norm": 7.237862112577957,
"learning_rate": 9.11001037723955e-07,
"logits/chosen": -0.7936111688613892,
"logits/rejected": -0.8008431196212769,
"logps/chosen": -332.17718505859375,
"logps/rejected": -352.5616760253906,
"loss": 0.6689,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7826520800590515,
"rewards/margins": 0.20596377551555634,
"rewards/rejected": -0.988615870475769,
"step": 128
},
{
"epoch": 0.27575150300601203,
"grad_norm": 8.604847241866956,
"learning_rate": 9.088596928322157e-07,
"logits/chosen": -0.8067824840545654,
"logits/rejected": -0.8039845824241638,
"logps/chosen": -333.2156982421875,
"logps/rejected": -357.6597595214844,
"loss": 0.6587,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.787762463092804,
"rewards/margins": 0.017246991395950317,
"rewards/rejected": -0.8050093650817871,
"step": 129
},
{
"epoch": 0.27788911155644624,
"grad_norm": 8.324207089057085,
"learning_rate": 9.066954722907638e-07,
"logits/chosen": -0.6775297522544861,
"logits/rejected": -0.7070217132568359,
"logps/chosen": -324.43402099609375,
"logps/rejected": -350.53851318359375,
"loss": 0.645,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.7566977739334106,
"rewards/margins": 0.24282482266426086,
"rewards/rejected": -0.9995224475860596,
"step": 130
},
{
"epoch": 0.2800267201068804,
"grad_norm": 7.364170729863742,
"learning_rate": 9.045084971874737e-07,
"logits/chosen": -0.7260534167289734,
"logits/rejected": -0.7187973260879517,
"logps/chosen": -294.4010925292969,
"logps/rejected": -310.21136474609375,
"loss": 0.6398,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.7721171379089355,
"rewards/margins": 0.15222765505313873,
"rewards/rejected": -0.9243447184562683,
"step": 131
},
{
"epoch": 0.2821643286573146,
"grad_norm": 7.35116325693309,
"learning_rate": 9.022988898833342e-07,
"logits/chosen": -0.7463628053665161,
"logits/rejected": -0.7459514141082764,
"logps/chosen": -329.623779296875,
"logps/rejected": -356.4615783691406,
"loss": 0.5991,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.8810457587242126,
"rewards/margins": 0.1869642734527588,
"rewards/rejected": -1.0680099725723267,
"step": 132
},
{
"epoch": 0.2843019372077488,
"grad_norm": 7.43517337943034,
"learning_rate": 9.000667740056032e-07,
"logits/chosen": -0.7253285646438599,
"logits/rejected": -0.7020008563995361,
"logps/chosen": -341.2428894042969,
"logps/rejected": -399.8907470703125,
"loss": 0.6251,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9917829036712646,
"rewards/margins": 0.3243829011917114,
"rewards/rejected": -1.3161659240722656,
"step": 133
},
{
"epoch": 0.28643954575818303,
"grad_norm": 8.02042016596172,
"learning_rate": 8.978122744408905e-07,
"logits/chosen": -0.6935924887657166,
"logits/rejected": -0.6478650569915771,
"logps/chosen": -383.7857971191406,
"logps/rejected": -403.4067077636719,
"loss": 0.6472,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.0208508968353271,
"rewards/margins": 0.2477567493915558,
"rewards/rejected": -1.2686076164245605,
"step": 134
},
{
"epoch": 0.28857715430861725,
"grad_norm": 7.085674453391065,
"learning_rate": 8.955355173281707e-07,
"logits/chosen": -0.7271559238433838,
"logits/rejected": -0.7309106588363647,
"logps/chosen": -310.3777160644531,
"logps/rejected": -329.25323486328125,
"loss": 0.6007,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.7959171533584595,
"rewards/margins": 0.21840126812458038,
"rewards/rejected": -1.0143184661865234,
"step": 135
},
{
"epoch": 0.29071476285905146,
"grad_norm": 7.837723075107936,
"learning_rate": 8.932366300517249e-07,
"logits/chosen": -0.771674633026123,
"logits/rejected": -0.7675716280937195,
"logps/chosen": -381.0829772949219,
"logps/rejected": -408.50616455078125,
"loss": 0.6332,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0345312356948853,
"rewards/margins": 0.18286427855491638,
"rewards/rejected": -1.217395544052124,
"step": 136
},
{
"epoch": 0.2928523714094856,
"grad_norm": 9.181399284387098,
"learning_rate": 8.909157412340149e-07,
"logits/chosen": -0.837311863899231,
"logits/rejected": -0.8280692100524902,
"logps/chosen": -368.6721496582031,
"logps/rejected": -397.39056396484375,
"loss": 0.672,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.0871038436889648,
"rewards/margins": 0.1502176821231842,
"rewards/rejected": -1.2373214960098267,
"step": 137
},
{
"epoch": 0.2949899799599198,
"grad_norm": 8.547486747659995,
"learning_rate": 8.885729807284854e-07,
"logits/chosen": -0.6511350274085999,
"logits/rejected": -0.6316956877708435,
"logps/chosen": -367.9530029296875,
"logps/rejected": -376.24652099609375,
"loss": 0.6666,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1004064083099365,
"rewards/margins": 0.1894245594739914,
"rewards/rejected": -1.2898309230804443,
"step": 138
},
{
"epoch": 0.29712758851035403,
"grad_norm": 7.209713050210504,
"learning_rate": 8.862084796122997e-07,
"logits/chosen": -0.7271043658256531,
"logits/rejected": -0.7313827276229858,
"logps/chosen": -305.42919921875,
"logps/rejected": -366.8320007324219,
"loss": 0.6285,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8596370816230774,
"rewards/margins": 0.34605735540390015,
"rewards/rejected": -1.205694556236267,
"step": 139
},
{
"epoch": 0.29926519706078825,
"grad_norm": 8.777840373189521,
"learning_rate": 8.838223701790055e-07,
"logits/chosen": -0.8329405188560486,
"logits/rejected": -0.8425594568252563,
"logps/chosen": -334.819580078125,
"logps/rejected": -353.1991882324219,
"loss": 0.6789,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9176943898200989,
"rewards/margins": 0.08534470945596695,
"rewards/rejected": -1.0030391216278076,
"step": 140
},
{
"epoch": 0.30140280561122246,
"grad_norm": 8.349695032017713,
"learning_rate": 8.814147859311332e-07,
"logits/chosen": -0.7287541031837463,
"logits/rejected": -0.747150182723999,
"logps/chosen": -338.96990966796875,
"logps/rejected": -393.1916809082031,
"loss": 0.6085,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.868696928024292,
"rewards/margins": 0.3027462959289551,
"rewards/rejected": -1.1714433431625366,
"step": 141
},
{
"epoch": 0.30354041416165667,
"grad_norm": 8.507916003943253,
"learning_rate": 8.789858615727264e-07,
"logits/chosen": -0.6775808930397034,
"logits/rejected": -0.6213993430137634,
"logps/chosen": -374.7777099609375,
"logps/rejected": -441.28265380859375,
"loss": 0.5921,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.076945424079895,
"rewards/margins": 0.4260719418525696,
"rewards/rejected": -1.5030174255371094,
"step": 142
},
{
"epoch": 0.3056780227120908,
"grad_norm": 8.266582578388473,
"learning_rate": 8.765357330018055e-07,
"logits/chosen": -0.7523927092552185,
"logits/rejected": -0.7748714685440063,
"logps/chosen": -353.6466064453125,
"logps/rejected": -402.60662841796875,
"loss": 0.625,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0103652477264404,
"rewards/margins": 0.3086761236190796,
"rewards/rejected": -1.31904137134552,
"step": 143
},
{
"epoch": 0.30781563126252504,
"grad_norm": 8.078736110217639,
"learning_rate": 8.740645373027634e-07,
"logits/chosen": -0.72418212890625,
"logits/rejected": -0.7301138639450073,
"logps/chosen": -414.23004150390625,
"logps/rejected": -465.2354736328125,
"loss": 0.6034,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1818435192108154,
"rewards/margins": 0.26852208375930786,
"rewards/rejected": -1.450365662574768,
"step": 144
},
{
"epoch": 0.30995323981295925,
"grad_norm": 8.551096723015775,
"learning_rate": 8.71572412738697e-07,
"logits/chosen": -0.7613222599029541,
"logits/rejected": -0.7519202828407288,
"logps/chosen": -376.8845520019531,
"logps/rejected": -391.2483215332031,
"loss": 0.6422,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.2536505460739136,
"rewards/margins": 0.05302443727850914,
"rewards/rejected": -1.3066749572753906,
"step": 145
},
{
"epoch": 0.31209084836339346,
"grad_norm": 10.264598567431593,
"learning_rate": 8.690594987436704e-07,
"logits/chosen": -0.6667072772979736,
"logits/rejected": -0.651785135269165,
"logps/chosen": -407.5121765136719,
"logps/rejected": -414.15325927734375,
"loss": 0.7022,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3455419540405273,
"rewards/margins": 0.1597069501876831,
"rewards/rejected": -1.5052489042282104,
"step": 146
},
{
"epoch": 0.31422845691382767,
"grad_norm": 8.003275854261016,
"learning_rate": 8.66525935914913e-07,
"logits/chosen": -0.70644611120224,
"logits/rejected": -0.7072776556015015,
"logps/chosen": -298.8578186035156,
"logps/rejected": -352.6321105957031,
"loss": 0.6026,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7530163526535034,
"rewards/margins": 0.3677568733692169,
"rewards/rejected": -1.1207730770111084,
"step": 147
},
{
"epoch": 0.3163660654642619,
"grad_norm": 9.622881147148561,
"learning_rate": 8.639718660049554e-07,
"logits/chosen": -0.7758994102478027,
"logits/rejected": -0.7696230411529541,
"logps/chosen": -305.4625549316406,
"logps/rejected": -307.0013732910156,
"loss": 0.6654,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.9157548546791077,
"rewards/margins": 0.10377232730388641,
"rewards/rejected": -1.0195271968841553,
"step": 148
},
{
"epoch": 0.31850367401469604,
"grad_norm": 9.830442606055694,
"learning_rate": 8.613974319136957e-07,
"logits/chosen": -0.6808797121047974,
"logits/rejected": -0.6591075658798218,
"logps/chosen": -328.95526123046875,
"logps/rejected": -344.4721374511719,
"loss": 0.653,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.1110159158706665,
"rewards/margins": 0.16696244478225708,
"rewards/rejected": -1.2779783010482788,
"step": 149
},
{
"epoch": 0.32064128256513025,
"grad_norm": 8.745762248320213,
"learning_rate": 8.588027776804058e-07,
"logits/chosen": -0.7875892519950867,
"logits/rejected": -0.7677904963493347,
"logps/chosen": -357.419677734375,
"logps/rejected": -373.40460205078125,
"loss": 0.6461,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0266187191009521,
"rewards/margins": 0.21186724305152893,
"rewards/rejected": -1.2384859323501587,
"step": 150
},
{
"epoch": 0.32277889111556446,
"grad_norm": 8.312609091320738,
"learning_rate": 8.561880484756724e-07,
"logits/chosen": -0.7948569059371948,
"logits/rejected": -0.7845500707626343,
"logps/chosen": -341.4780578613281,
"logps/rejected": -384.87615966796875,
"loss": 0.622,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.9681000113487244,
"rewards/margins": 0.3343212306499481,
"rewards/rejected": -1.30242121219635,
"step": 151
},
{
"epoch": 0.32491649966599867,
"grad_norm": 9.180312248349901,
"learning_rate": 8.535533905932737e-07,
"logits/chosen": -0.7717313170433044,
"logits/rejected": -0.7632758617401123,
"logps/chosen": -349.5531921386719,
"logps/rejected": -348.3159484863281,
"loss": 0.6628,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1458628177642822,
"rewards/margins": 0.07357059419155121,
"rewards/rejected": -1.2194334268569946,
"step": 152
},
{
"epoch": 0.3270541082164329,
"grad_norm": 8.40986643995496,
"learning_rate": 8.508989514419958e-07,
"logits/chosen": -0.6287474036216736,
"logits/rejected": -0.5992534160614014,
"logps/chosen": -327.4925842285156,
"logps/rejected": -357.3055725097656,
"loss": 0.6299,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0919466018676758,
"rewards/margins": 0.2580554485321045,
"rewards/rejected": -1.3500020503997803,
"step": 153
},
{
"epoch": 0.3291917167668671,
"grad_norm": 9.217933303499299,
"learning_rate": 8.482248795373835e-07,
"logits/chosen": -0.7915253639221191,
"logits/rejected": -0.7664984464645386,
"logps/chosen": -368.6262512207031,
"logps/rejected": -391.03564453125,
"loss": 0.6426,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.0133848190307617,
"rewards/margins": 0.11703261733055115,
"rewards/rejected": -1.1304173469543457,
"step": 154
},
{
"epoch": 0.33132932531730125,
"grad_norm": 8.472153097719477,
"learning_rate": 8.455313244934324e-07,
"logits/chosen": -0.8312329649925232,
"logits/rejected": -0.8426264524459839,
"logps/chosen": -341.4083251953125,
"logps/rejected": -377.6736145019531,
"loss": 0.6149,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0413603782653809,
"rewards/margins": 0.24394717812538147,
"rewards/rejected": -1.2853076457977295,
"step": 155
},
{
"epoch": 0.33346693386773546,
"grad_norm": 8.186342714745207,
"learning_rate": 8.428184370142171e-07,
"logits/chosen": -0.6921215653419495,
"logits/rejected": -0.7096705436706543,
"logps/chosen": -363.21539306640625,
"logps/rejected": -384.2535400390625,
"loss": 0.6144,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9978117346763611,
"rewards/margins": 0.18856188654899597,
"rewards/rejected": -1.1863737106323242,
"step": 156
},
{
"epoch": 0.33560454241816967,
"grad_norm": 8.626047256669759,
"learning_rate": 8.400863688854596e-07,
"logits/chosen": -0.8120739459991455,
"logits/rejected": -0.8196284770965576,
"logps/chosen": -347.4595947265625,
"logps/rejected": -357.397705078125,
"loss": 0.6446,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0381972789764404,
"rewards/margins": 0.1279696524143219,
"rewards/rejected": -1.1661670207977295,
"step": 157
},
{
"epoch": 0.3377421509686039,
"grad_norm": 11.860996272985476,
"learning_rate": 8.373352729660372e-07,
"logits/chosen": -0.7756985425949097,
"logits/rejected": -0.7191120386123657,
"logps/chosen": -395.55401611328125,
"logps/rejected": -403.5904541015625,
"loss": 0.6526,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2536171674728394,
"rewards/margins": 0.026868807151913643,
"rewards/rejected": -1.280485987663269,
"step": 158
},
{
"epoch": 0.3398797595190381,
"grad_norm": 9.122376865267006,
"learning_rate": 8.34565303179429e-07,
"logits/chosen": -0.8109874725341797,
"logits/rejected": -0.784782886505127,
"logps/chosen": -349.2673645019531,
"logps/rejected": -355.31427001953125,
"loss": 0.6482,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1418871879577637,
"rewards/margins": 0.0541604682803154,
"rewards/rejected": -1.1960475444793701,
"step": 159
},
{
"epoch": 0.3420173680694723,
"grad_norm": 10.446536418824028,
"learning_rate": 8.317766145051057e-07,
"logits/chosen": -0.8555909395217896,
"logits/rejected": -0.8299651145935059,
"logps/chosen": -393.7392272949219,
"logps/rejected": -433.85565185546875,
"loss": 0.68,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2185660600662231,
"rewards/margins": 0.22462578117847443,
"rewards/rejected": -1.4431917667388916,
"step": 160
},
{
"epoch": 0.34415497661990646,
"grad_norm": 7.825411706225049,
"learning_rate": 8.289693629698563e-07,
"logits/chosen": -0.7958833575248718,
"logits/rejected": -0.8027774095535278,
"logps/chosen": -402.79913330078125,
"logps/rejected": -437.49591064453125,
"loss": 0.6203,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1367416381835938,
"rewards/margins": 0.3376201391220093,
"rewards/rejected": -1.4743616580963135,
"step": 161
},
{
"epoch": 0.34629258517034067,
"grad_norm": 9.034553362218846,
"learning_rate": 8.261437056390606e-07,
"logits/chosen": -0.697302520275116,
"logits/rejected": -0.6625763773918152,
"logps/chosen": -349.05950927734375,
"logps/rejected": -353.0817565917969,
"loss": 0.6857,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.9678500294685364,
"rewards/margins": 0.10605783760547638,
"rewards/rejected": -1.0739078521728516,
"step": 162
},
{
"epoch": 0.3484301937207749,
"grad_norm": 8.737777630064887,
"learning_rate": 8.232998006078997e-07,
"logits/chosen": -0.674803614616394,
"logits/rejected": -0.6823403835296631,
"logps/chosen": -358.0148620605469,
"logps/rejected": -384.6661071777344,
"loss": 0.6235,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1485981941223145,
"rewards/margins": 0.25151392817497253,
"rewards/rejected": -1.4001121520996094,
"step": 163
},
{
"epoch": 0.3505678022712091,
"grad_norm": 9.343701031382219,
"learning_rate": 8.20437806992512e-07,
"logits/chosen": -0.7436198592185974,
"logits/rejected": -0.7431969046592712,
"logps/chosen": -316.6277770996094,
"logps/rejected": -367.4931335449219,
"loss": 0.6664,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0274879932403564,
"rewards/margins": 0.1997983604669571,
"rewards/rejected": -1.2272862195968628,
"step": 164
},
{
"epoch": 0.3527054108216433,
"grad_norm": 8.418205426089232,
"learning_rate": 8.175578849210894e-07,
"logits/chosen": -0.7993863224983215,
"logits/rejected": -0.7827702164649963,
"logps/chosen": -393.60791015625,
"logps/rejected": -424.71124267578125,
"loss": 0.6392,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1842068433761597,
"rewards/margins": 0.3191227316856384,
"rewards/rejected": -1.5033295154571533,
"step": 165
},
{
"epoch": 0.35484301937207746,
"grad_norm": 8.984910438057955,
"learning_rate": 8.146601955249187e-07,
"logits/chosen": -0.7122502326965332,
"logits/rejected": -0.7099603414535522,
"logps/chosen": -365.7021179199219,
"logps/rejected": -365.78912353515625,
"loss": 0.6637,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2422032356262207,
"rewards/margins": -0.04617507755756378,
"rewards/rejected": -1.196028232574463,
"step": 166
},
{
"epoch": 0.3569806279225117,
"grad_norm": 7.926871474687121,
"learning_rate": 8.117449009293668e-07,
"logits/chosen": -0.7609111666679382,
"logits/rejected": -0.7424649000167847,
"logps/chosen": -367.951416015625,
"logps/rejected": -388.4793395996094,
"loss": 0.6288,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1471714973449707,
"rewards/margins": 0.1946582943201065,
"rewards/rejected": -1.341829776763916,
"step": 167
},
{
"epoch": 0.3591182364729459,
"grad_norm": 8.370915442021108,
"learning_rate": 8.088121642448089e-07,
"logits/chosen": -0.7230314016342163,
"logits/rejected": -0.7338634729385376,
"logps/chosen": -383.22216796875,
"logps/rejected": -422.01068115234375,
"loss": 0.6387,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0255863666534424,
"rewards/margins": 0.5024391412734985,
"rewards/rejected": -1.528025507926941,
"step": 168
},
{
"epoch": 0.3612558450233801,
"grad_norm": 8.13714962488371,
"learning_rate": 8.058621495575031e-07,
"logits/chosen": -0.6844447255134583,
"logits/rejected": -0.6487768888473511,
"logps/chosen": -350.7132568359375,
"logps/rejected": -367.8271484375,
"loss": 0.6105,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.101859450340271,
"rewards/margins": 0.21480971574783325,
"rewards/rejected": -1.316669225692749,
"step": 169
},
{
"epoch": 0.3633934535738143,
"grad_norm": 9.892000882662467,
"learning_rate": 8.028950219204099e-07,
"logits/chosen": -0.5773683190345764,
"logits/rejected": -0.5765209794044495,
"logps/chosen": -370.47796630859375,
"logps/rejected": -415.53167724609375,
"loss": 0.5997,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1410350799560547,
"rewards/margins": 0.3918205499649048,
"rewards/rejected": -1.532855749130249,
"step": 170
},
{
"epoch": 0.3655310621242485,
"grad_norm": 8.106890489682133,
"learning_rate": 7.999109473439569e-07,
"logits/chosen": -0.6529942154884338,
"logits/rejected": -0.6343085169792175,
"logps/chosen": -358.777099609375,
"logps/rejected": -388.01995849609375,
"loss": 0.6249,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.1139953136444092,
"rewards/margins": 0.19699575006961823,
"rewards/rejected": -1.3109909296035767,
"step": 171
},
{
"epoch": 0.3676686706746827,
"grad_norm": 7.818768691894028,
"learning_rate": 7.969100927867507e-07,
"logits/chosen": -0.7647715210914612,
"logits/rejected": -0.768187940120697,
"logps/chosen": -315.3676452636719,
"logps/rejected": -344.0660095214844,
"loss": 0.6095,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.9741207361221313,
"rewards/margins": 0.2204282581806183,
"rewards/rejected": -1.1945490837097168,
"step": 172
},
{
"epoch": 0.3698062792251169,
"grad_norm": 8.22923262916057,
"learning_rate": 7.938926261462365e-07,
"logits/chosen": -0.7851884961128235,
"logits/rejected": -0.8041540384292603,
"logps/chosen": -318.61712646484375,
"logps/rejected": -398.345458984375,
"loss": 0.5961,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9148141741752625,
"rewards/margins": 0.5934224128723145,
"rewards/rejected": -1.5082364082336426,
"step": 173
},
{
"epoch": 0.3719438877755511,
"grad_norm": 8.000153942367305,
"learning_rate": 7.908587162493028e-07,
"logits/chosen": -0.6852933168411255,
"logits/rejected": -0.6849787831306458,
"logps/chosen": -406.8155212402344,
"logps/rejected": -447.3205261230469,
"loss": 0.6301,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.2369593381881714,
"rewards/margins": 0.23439320921897888,
"rewards/rejected": -1.4713525772094727,
"step": 174
},
{
"epoch": 0.3740814963259853,
"grad_norm": 9.316195243095242,
"learning_rate": 7.878085328428368e-07,
"logits/chosen": -0.7409089803695679,
"logits/rejected": -0.7157390713691711,
"logps/chosen": -338.6581115722656,
"logps/rejected": -357.2071838378906,
"loss": 0.646,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.051304817199707,
"rewards/margins": 0.1540244072675705,
"rewards/rejected": -1.2053292989730835,
"step": 175
},
{
"epoch": 0.3762191048764195,
"grad_norm": 9.548217777892644,
"learning_rate": 7.84742246584226e-07,
"logits/chosen": -0.644868016242981,
"logits/rejected": -0.6358535885810852,
"logps/chosen": -280.54644775390625,
"logps/rejected": -320.94866943359375,
"loss": 0.6304,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9811806082725525,
"rewards/margins": 0.36237311363220215,
"rewards/rejected": -1.3435536623001099,
"step": 176
},
{
"epoch": 0.37835671342685373,
"grad_norm": 8.213256746442323,
"learning_rate": 7.81660029031811e-07,
"logits/chosen": -0.7351135015487671,
"logits/rejected": -0.7099937796592712,
"logps/chosen": -403.18609619140625,
"logps/rejected": -427.598876953125,
"loss": 0.6286,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3992477655410767,
"rewards/margins": 0.21843519806861877,
"rewards/rejected": -1.6176831722259521,
"step": 177
},
{
"epoch": 0.3804943219772879,
"grad_norm": 9.877087816575154,
"learning_rate": 7.785620526352861e-07,
"logits/chosen": -0.6065413355827332,
"logits/rejected": -0.6187620759010315,
"logps/chosen": -417.3489074707031,
"logps/rejected": -418.964599609375,
"loss": 0.6396,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4968540668487549,
"rewards/margins": 0.15608513355255127,
"rewards/rejected": -1.6529392004013062,
"step": 178
},
{
"epoch": 0.3826319305277221,
"grad_norm": 9.518985768520599,
"learning_rate": 7.754484907260512e-07,
"logits/chosen": -0.6335625648498535,
"logits/rejected": -0.6501979231834412,
"logps/chosen": -320.66973876953125,
"logps/rejected": -377.0364990234375,
"loss": 0.6203,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.9643224477767944,
"rewards/margins": 0.4927278459072113,
"rewards/rejected": -1.4570502042770386,
"step": 179
},
{
"epoch": 0.3847695390781563,
"grad_norm": 8.327661579909856,
"learning_rate": 7.723195175075135e-07,
"logits/chosen": -0.8008890748023987,
"logits/rejected": -0.8070433735847473,
"logps/chosen": -385.35711669921875,
"logps/rejected": -417.4620056152344,
"loss": 0.63,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1224894523620605,
"rewards/margins": 0.2396533489227295,
"rewards/rejected": -1.36214280128479,
"step": 180
},
{
"epoch": 0.3869071476285905,
"grad_norm": 8.177222796415196,
"learning_rate": 7.691753080453411e-07,
"logits/chosen": -0.7654060125350952,
"logits/rejected": -0.7563324570655823,
"logps/chosen": -372.6927185058594,
"logps/rejected": -392.412841796875,
"loss": 0.6178,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2625975608825684,
"rewards/margins": 0.1758994460105896,
"rewards/rejected": -1.4384969472885132,
"step": 181
},
{
"epoch": 0.38904475617902473,
"grad_norm": 8.268029500490163,
"learning_rate": 7.660160382576683e-07,
"logits/chosen": -0.8044633865356445,
"logits/rejected": -0.8295111060142517,
"logps/chosen": -387.167724609375,
"logps/rejected": -421.73223876953125,
"loss": 0.6057,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2140891551971436,
"rewards/margins": 0.26494261622428894,
"rewards/rejected": -1.4790318012237549,
"step": 182
},
{
"epoch": 0.39118236472945894,
"grad_norm": 8.205646872076233,
"learning_rate": 7.628418849052523e-07,
"logits/chosen": -0.7259032726287842,
"logits/rejected": -0.7147877812385559,
"logps/chosen": -332.19952392578125,
"logps/rejected": -358.916748046875,
"loss": 0.6335,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.2255278825759888,
"rewards/margins": 0.18311724066734314,
"rewards/rejected": -1.4086451530456543,
"step": 183
},
{
"epoch": 0.3933199732798931,
"grad_norm": 9.273642724003066,
"learning_rate": 7.596530255815845e-07,
"logits/chosen": -0.6111272573471069,
"logits/rejected": -0.6174825429916382,
"logps/chosen": -431.119140625,
"logps/rejected": -474.2237854003906,
"loss": 0.5914,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.340221643447876,
"rewards/margins": 0.47385597229003906,
"rewards/rejected": -1.814077615737915,
"step": 184
},
{
"epoch": 0.3954575818303273,
"grad_norm": 9.03001535554559,
"learning_rate": 7.564496387029531e-07,
"logits/chosen": -0.5710060000419617,
"logits/rejected": -0.6027272343635559,
"logps/chosen": -402.5880432128906,
"logps/rejected": -461.81390380859375,
"loss": 0.5815,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2832955121994019,
"rewards/margins": 0.44037097692489624,
"rewards/rejected": -1.7236665487289429,
"step": 185
},
{
"epoch": 0.3975951903807615,
"grad_norm": 10.65885585737553,
"learning_rate": 7.532319034984614e-07,
"logits/chosen": -0.6792325377464294,
"logits/rejected": -0.7070844769477844,
"logps/chosen": -345.3462219238281,
"logps/rejected": -380.2834167480469,
"loss": 0.606,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0898866653442383,
"rewards/margins": 0.2850308120250702,
"rewards/rejected": -1.3749175071716309,
"step": 186
},
{
"epoch": 0.39973279893119573,
"grad_norm": 10.832365903487103,
"learning_rate": 7.5e-07,
"logits/chosen": -0.6209002733230591,
"logits/rejected": -0.5837200880050659,
"logps/chosen": -448.4324035644531,
"logps/rejected": -443.7789306640625,
"loss": 0.6723,
"rewards/accuracies": 0.46875,
"rewards/chosen": -1.5713595151901245,
"rewards/margins": 0.1559199094772339,
"rewards/rejected": -1.7272793054580688,
"step": 187
},
{
"epoch": 0.40187040748162994,
"grad_norm": 8.78650176694853,
"learning_rate": 7.467541090321733e-07,
"logits/chosen": -0.6626260876655579,
"logits/rejected": -0.6681480407714844,
"logps/chosen": -357.17535400390625,
"logps/rejected": -392.3631591796875,
"loss": 0.6327,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0751440525054932,
"rewards/margins": 0.28642958402633667,
"rewards/rejected": -1.361573576927185,
"step": 188
},
{
"epoch": 0.40400801603206415,
"grad_norm": 9.268270452493919,
"learning_rate": 7.434944122021836e-07,
"logits/chosen": -0.7080458402633667,
"logits/rejected": -0.6918138861656189,
"logps/chosen": -428.0231628417969,
"logps/rejected": -447.84588623046875,
"loss": 0.5866,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2396382093429565,
"rewards/margins": 0.2672892212867737,
"rewards/rejected": -1.506927490234375,
"step": 189
},
{
"epoch": 0.4061456245824983,
"grad_norm": 9.083796097067385,
"learning_rate": 7.402210918896689e-07,
"logits/chosen": -0.6990772485733032,
"logits/rejected": -0.6821334362030029,
"logps/chosen": -330.4400329589844,
"logps/rejected": -351.8786926269531,
"loss": 0.5944,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.078837275505066,
"rewards/margins": 0.31049594283103943,
"rewards/rejected": -1.3893331289291382,
"step": 190
},
{
"epoch": 0.4082832331329325,
"grad_norm": 8.342069569715447,
"learning_rate": 7.369343312364993e-07,
"logits/chosen": -0.6898236870765686,
"logits/rejected": -0.7303708791732788,
"logps/chosen": -365.74688720703125,
"logps/rejected": -406.60125732421875,
"loss": 0.5822,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.3478041887283325,
"rewards/margins": 0.3217250108718872,
"rewards/rejected": -1.6695290803909302,
"step": 191
},
{
"epoch": 0.41042084168336673,
"grad_norm": 11.11112446563951,
"learning_rate": 7.33634314136531e-07,
"logits/chosen": -0.567010223865509,
"logits/rejected": -0.5823702812194824,
"logps/chosen": -351.3297119140625,
"logps/rejected": -352.91400146484375,
"loss": 0.6731,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.202454924583435,
"rewards/margins": 0.1386478990316391,
"rewards/rejected": -1.3411028385162354,
"step": 192
},
{
"epoch": 0.41255845023380094,
"grad_norm": 10.094586281846308,
"learning_rate": 7.303212252253161e-07,
"logits/chosen": -0.6867839694023132,
"logits/rejected": -0.631986677646637,
"logps/chosen": -446.3096008300781,
"logps/rejected": -481.1722717285156,
"loss": 0.5804,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.420650601387024,
"rewards/margins": 0.3154638409614563,
"rewards/rejected": -1.7361143827438354,
"step": 193
},
{
"epoch": 0.41469605878423516,
"grad_norm": 9.22745603420781,
"learning_rate": 7.269952498697734e-07,
"logits/chosen": -0.6122913360595703,
"logits/rejected": -0.5846338868141174,
"logps/chosen": -404.2279052734375,
"logps/rejected": -479.53546142578125,
"loss": 0.5926,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4300614595413208,
"rewards/margins": 0.682074785232544,
"rewards/rejected": -2.112136125564575,
"step": 194
},
{
"epoch": 0.4168336673346693,
"grad_norm": 8.922810963128454,
"learning_rate": 7.236565741578162e-07,
"logits/chosen": -0.7965989112854004,
"logits/rejected": -0.8105958104133606,
"logps/chosen": -412.70068359375,
"logps/rejected": -459.9786376953125,
"loss": 0.6098,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1933541297912598,
"rewards/margins": 0.4532015025615692,
"rewards/rejected": -1.6465556621551514,
"step": 195
},
{
"epoch": 0.4189712758851035,
"grad_norm": 9.70951721797377,
"learning_rate": 7.203053848879418e-07,
"logits/chosen": -0.66545569896698,
"logits/rejected": -0.6426224708557129,
"logps/chosen": -417.62750244140625,
"logps/rejected": -446.0390625,
"loss": 0.6345,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.4516615867614746,
"rewards/margins": 0.234949991106987,
"rewards/rejected": -1.6866116523742676,
"step": 196
},
{
"epoch": 0.42110888443553773,
"grad_norm": 10.834268818449964,
"learning_rate": 7.16941869558779e-07,
"logits/chosen": -0.6952583193778992,
"logits/rejected": -0.6965677738189697,
"logps/chosen": -447.6587829589844,
"logps/rejected": -491.00872802734375,
"loss": 0.6368,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.6263762712478638,
"rewards/margins": 0.3232609033584595,
"rewards/rejected": -1.9496371746063232,
"step": 197
},
{
"epoch": 0.42324649298597194,
"grad_norm": 10.734655546374897,
"learning_rate": 7.135662163585984e-07,
"logits/chosen": -0.7219685316085815,
"logits/rejected": -0.7239058613777161,
"logps/chosen": -379.8273620605469,
"logps/rejected": -416.1621398925781,
"loss": 0.6473,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4015758037567139,
"rewards/margins": 0.279184490442276,
"rewards/rejected": -1.6807602643966675,
"step": 198
},
{
"epoch": 0.42538410153640616,
"grad_norm": 10.226763136881486,
"learning_rate": 7.101786141547828e-07,
"logits/chosen": -0.6653244495391846,
"logits/rejected": -0.6480982303619385,
"logps/chosen": -388.545166015625,
"logps/rejected": -400.65447998046875,
"loss": 0.6346,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3625783920288086,
"rewards/margins": 0.17150306701660156,
"rewards/rejected": -1.5340813398361206,
"step": 199
},
{
"epoch": 0.42752171008684037,
"grad_norm": 11.800159452188982,
"learning_rate": 7.067792524832603e-07,
"logits/chosen": -0.802920401096344,
"logits/rejected": -0.7953581213951111,
"logps/chosen": -469.48583984375,
"logps/rejected": -514.489501953125,
"loss": 0.6935,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7296805381774902,
"rewards/margins": 0.4207393229007721,
"rewards/rejected": -2.1504197120666504,
"step": 200
},
{
"epoch": 0.42752171008684037,
"eval_logits/chosen": -0.6628897190093994,
"eval_logits/rejected": -0.6649256348609924,
"eval_logps/chosen": -392.1436767578125,
"eval_logps/rejected": -424.3627624511719,
"eval_loss": 0.635185182094574,
"eval_rewards/accuracies": 0.6544715166091919,
"eval_rewards/chosen": -1.3659569025039673,
"eval_rewards/margins": 0.2651316225528717,
"eval_rewards/rejected": -1.6310884952545166,
"eval_runtime": 376.3857,
"eval_samples_per_second": 5.21,
"eval_steps_per_second": 0.327,
"step": 200
},
{
"epoch": 0.4296593186372745,
"grad_norm": 10.509050348979823,
"learning_rate": 7.033683215379002e-07,
"logits/chosen": -0.7490158081054688,
"logits/rejected": -0.7795702219009399,
"logps/chosen": -444.27264404296875,
"logps/rejected": -450.5096435546875,
"loss": 0.6259,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.6222877502441406,
"rewards/margins": 0.0773380845785141,
"rewards/rejected": -1.699625849723816,
"step": 201
},
{
"epoch": 0.43179692718770873,
"grad_norm": 9.361779380994284,
"learning_rate": 6.999460121598704e-07,
"logits/chosen": -0.8867595195770264,
"logits/rejected": -0.8778724074363708,
"logps/chosen": -395.88262939453125,
"logps/rejected": -424.9254455566406,
"loss": 0.6199,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3475773334503174,
"rewards/margins": 0.2797107696533203,
"rewards/rejected": -1.6272879838943481,
"step": 202
},
{
"epoch": 0.43393453573814295,
"grad_norm": 10.465713404951545,
"learning_rate": 6.965125158269618e-07,
"logits/chosen": -0.7478022575378418,
"logits/rejected": -0.7213735580444336,
"logps/chosen": -375.4535217285156,
"logps/rejected": -400.4565734863281,
"loss": 0.6452,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.3242757320404053,
"rewards/margins": 0.18743321299552917,
"rewards/rejected": -1.5117088556289673,
"step": 203
},
{
"epoch": 0.43607214428857716,
"grad_norm": 9.542403717436502,
"learning_rate": 6.93068024642873e-07,
"logits/chosen": -0.7434294819831848,
"logits/rejected": -0.7202074527740479,
"logps/chosen": -367.4134216308594,
"logps/rejected": -395.12396240234375,
"loss": 0.6408,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.321439266204834,
"rewards/margins": 0.3816969692707062,
"rewards/rejected": -1.7031362056732178,
"step": 204
},
{
"epoch": 0.43820975283901137,
"grad_norm": 11.318421364351005,
"learning_rate": 6.896127313264642e-07,
"logits/chosen": -0.6576538681983948,
"logits/rejected": -0.6434054374694824,
"logps/chosen": -381.1850280761719,
"logps/rejected": -385.73736572265625,
"loss": 0.6267,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.294406771659851,
"rewards/margins": 0.12336639314889908,
"rewards/rejected": -1.4177730083465576,
"step": 205
},
{
"epoch": 0.4403473613894456,
"grad_norm": 8.514452329680676,
"learning_rate": 6.861468292009726e-07,
"logits/chosen": -0.652076780796051,
"logits/rejected": -0.6382969617843628,
"logps/chosen": -392.5809326171875,
"logps/rejected": -430.596923828125,
"loss": 0.6304,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.44416344165802,
"rewards/margins": 0.37998878955841064,
"rewards/rejected": -1.8241522312164307,
"step": 206
},
{
"epoch": 0.44248496993987974,
"grad_norm": 9.679027742003948,
"learning_rate": 6.826705121831976e-07,
"logits/chosen": -0.7171617746353149,
"logits/rejected": -0.7156708240509033,
"logps/chosen": -378.35528564453125,
"logps/rejected": -411.4539489746094,
"loss": 0.6376,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.3946869373321533,
"rewards/margins": 0.37037399411201477,
"rewards/rejected": -1.7650609016418457,
"step": 207
},
{
"epoch": 0.44462257849031395,
"grad_norm": 10.610288706227443,
"learning_rate": 6.7918397477265e-07,
"logits/chosen": -0.6665509939193726,
"logits/rejected": -0.6577183604240417,
"logps/chosen": -365.9376525878906,
"logps/rejected": -362.8846740722656,
"loss": 0.6653,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1479219198226929,
"rewards/margins": 0.01849663257598877,
"rewards/rejected": -1.1664186716079712,
"step": 208
},
{
"epoch": 0.44676018704074816,
"grad_norm": 9.112597710939323,
"learning_rate": 6.756874120406714e-07,
"logits/chosen": -0.6265541315078735,
"logits/rejected": -0.61783766746521,
"logps/chosen": -381.3807067871094,
"logps/rejected": -425.4930725097656,
"loss": 0.6119,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.280937671661377,
"rewards/margins": 0.3031711280345917,
"rewards/rejected": -1.584108829498291,
"step": 209
},
{
"epoch": 0.44889779559118237,
"grad_norm": 9.390736537982283,
"learning_rate": 6.721810196195174e-07,
"logits/chosen": -0.7654869556427002,
"logits/rejected": -0.7667275071144104,
"logps/chosen": -417.27069091796875,
"logps/rejected": -449.1034851074219,
"loss": 0.6217,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.3596787452697754,
"rewards/margins": 0.3042774796485901,
"rewards/rejected": -1.6639561653137207,
"step": 210
},
{
"epoch": 0.4510354041416166,
"grad_norm": 9.394078254466548,
"learning_rate": 6.68664993691415e-07,
"logits/chosen": -0.6547084450721741,
"logits/rejected": -0.647241473197937,
"logps/chosen": -336.94915771484375,
"logps/rejected": -371.03515625,
"loss": 0.6312,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9777745604515076,
"rewards/margins": 0.2829311490058899,
"rewards/rejected": -1.2607057094573975,
"step": 211
},
{
"epoch": 0.4531730126920508,
"grad_norm": 8.760414806290829,
"learning_rate": 6.651395309775836e-07,
"logits/chosen": -0.6064110398292542,
"logits/rejected": -0.5819242000579834,
"logps/chosen": -353.7124938964844,
"logps/rejected": -384.0793151855469,
"loss": 0.5966,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1342616081237793,
"rewards/margins": 0.3782859742641449,
"rewards/rejected": -1.512547492980957,
"step": 212
},
{
"epoch": 0.45531062124248495,
"grad_norm": 9.331952485323354,
"learning_rate": 6.6160482872723e-07,
"logits/chosen": -0.6409544944763184,
"logits/rejected": -0.6478085517883301,
"logps/chosen": -374.2773742675781,
"logps/rejected": -397.2945861816406,
"loss": 0.6342,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2687841653823853,
"rewards/margins": 0.11245452612638474,
"rewards/rejected": -1.3812386989593506,
"step": 213
},
{
"epoch": 0.45744822979291916,
"grad_norm": 9.844190008748196,
"learning_rate": 6.580610847065123e-07,
"logits/chosen": -0.6078667640686035,
"logits/rejected": -0.60109543800354,
"logps/chosen": -357.74810791015625,
"logps/rejected": -391.06268310546875,
"loss": 0.614,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1224141120910645,
"rewards/margins": 0.23654705286026,
"rewards/rejected": -1.3589611053466797,
"step": 214
},
{
"epoch": 0.45958583834335337,
"grad_norm": 9.317047438854233,
"learning_rate": 6.545084971874736e-07,
"logits/chosen": -0.608707845211029,
"logits/rejected": -0.6254767775535583,
"logps/chosen": -340.4634094238281,
"logps/rejected": -377.37152099609375,
"loss": 0.655,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0063505172729492,
"rewards/margins": 0.29992133378982544,
"rewards/rejected": -1.3062719106674194,
"step": 215
},
{
"epoch": 0.4617234468937876,
"grad_norm": 9.52121536372048,
"learning_rate": 6.509472649369509e-07,
"logits/chosen": -0.642886221408844,
"logits/rejected": -0.6272940039634705,
"logps/chosen": -324.8238525390625,
"logps/rejected": -367.4193115234375,
"loss": 0.5939,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9792121052742004,
"rewards/margins": 0.3688339293003082,
"rewards/rejected": -1.3480459451675415,
"step": 216
},
{
"epoch": 0.4638610554442218,
"grad_norm": 10.890742309360663,
"learning_rate": 6.473775872054521e-07,
"logits/chosen": -0.6968441009521484,
"logits/rejected": -0.6998182535171509,
"logps/chosen": -425.0888977050781,
"logps/rejected": -457.17889404296875,
"loss": 0.6358,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.394580364227295,
"rewards/margins": 0.2801092267036438,
"rewards/rejected": -1.6746896505355835,
"step": 217
},
{
"epoch": 0.465998663994656,
"grad_norm": 8.973438938365845,
"learning_rate": 6.437996637160086e-07,
"logits/chosen": -0.6339977979660034,
"logits/rejected": -0.605747401714325,
"logps/chosen": -359.0996398925781,
"logps/rejected": -398.10284423828125,
"loss": 0.6284,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1331509351730347,
"rewards/margins": 0.29103800654411316,
"rewards/rejected": -1.4241892099380493,
"step": 218
},
{
"epoch": 0.46813627254509016,
"grad_norm": 9.024594454350343,
"learning_rate": 6.402136946530014e-07,
"logits/chosen": -0.6726840734481812,
"logits/rejected": -0.6727656722068787,
"logps/chosen": -411.4464111328125,
"logps/rejected": -438.1039733886719,
"loss": 0.6074,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.137448787689209,
"rewards/margins": 0.2949088513851166,
"rewards/rejected": -1.4323575496673584,
"step": 219
},
{
"epoch": 0.47027388109552437,
"grad_norm": 9.318954884923675,
"learning_rate": 6.3661988065096e-07,
"logits/chosen": -0.5828653573989868,
"logits/rejected": -0.5879778861999512,
"logps/chosen": -416.6065673828125,
"logps/rejected": -447.0008544921875,
"loss": 0.6169,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.4899258613586426,
"rewards/margins": 0.2851335406303406,
"rewards/rejected": -1.7750593423843384,
"step": 220
},
{
"epoch": 0.4724114896459586,
"grad_norm": 10.91663306855251,
"learning_rate": 6.330184227833375e-07,
"logits/chosen": -0.6656166315078735,
"logits/rejected": -0.654589056968689,
"logps/chosen": -380.12811279296875,
"logps/rejected": -417.3984069824219,
"loss": 0.5782,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2443169355392456,
"rewards/margins": 0.4823899269104004,
"rewards/rejected": -1.726706862449646,
"step": 221
},
{
"epoch": 0.4745490981963928,
"grad_norm": 9.66304655835996,
"learning_rate": 6.294095225512604e-07,
"logits/chosen": -0.6804403066635132,
"logits/rejected": -0.6730751395225525,
"logps/chosen": -391.51995849609375,
"logps/rejected": -434.95928955078125,
"loss": 0.6007,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.164958119392395,
"rewards/margins": 0.4750928580760956,
"rewards/rejected": -1.640051007270813,
"step": 222
},
{
"epoch": 0.476686706746827,
"grad_norm": 11.307540321918372,
"learning_rate": 6.257933818722542e-07,
"logits/chosen": -0.6279383301734924,
"logits/rejected": -0.6163449883460999,
"logps/chosen": -376.4117736816406,
"logps/rejected": -397.33917236328125,
"loss": 0.6871,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.320064663887024,
"rewards/margins": 0.12475023418664932,
"rewards/rejected": -1.444814920425415,
"step": 223
},
{
"epoch": 0.4788243152972612,
"grad_norm": 9.673767465041793,
"learning_rate": 6.22170203068947e-07,
"logits/chosen": -0.711574912071228,
"logits/rejected": -0.6971991062164307,
"logps/chosen": -370.3948059082031,
"logps/rejected": -394.70379638671875,
"loss": 0.594,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2323672771453857,
"rewards/margins": 0.2632126808166504,
"rewards/rejected": -1.4955798387527466,
"step": 224
},
{
"epoch": 0.48096192384769537,
"grad_norm": 14.301565196390225,
"learning_rate": 6.185401888577487e-07,
"logits/chosen": -0.7201038599014282,
"logits/rejected": -0.713502049446106,
"logps/chosen": -411.15997314453125,
"logps/rejected": -440.4834289550781,
"loss": 0.6286,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4549378156661987,
"rewards/margins": 0.26348626613616943,
"rewards/rejected": -1.7184242010116577,
"step": 225
},
{
"epoch": 0.4830995323981296,
"grad_norm": 10.44824838559519,
"learning_rate": 6.149035423375098e-07,
"logits/chosen": -0.7044095993041992,
"logits/rejected": -0.7011440992355347,
"logps/chosen": -394.7225341796875,
"logps/rejected": -418.9303283691406,
"loss": 0.6385,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2195916175842285,
"rewards/margins": 0.20536328852176666,
"rewards/rejected": -1.424954891204834,
"step": 226
},
{
"epoch": 0.4852371409485638,
"grad_norm": 11.00631388790137,
"learning_rate": 6.112604669781572e-07,
"logits/chosen": -0.735901951789856,
"logits/rejected": -0.6977694034576416,
"logps/chosen": -438.9553527832031,
"logps/rejected": -447.6878662109375,
"loss": 0.6141,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.478360891342163,
"rewards/margins": 0.119898721575737,
"rewards/rejected": -1.5982595682144165,
"step": 227
},
{
"epoch": 0.487374749498998,
"grad_norm": 10.507160088155747,
"learning_rate": 6.07611166609311e-07,
"logits/chosen": -0.7429340481758118,
"logits/rejected": -0.7295467257499695,
"logps/chosen": -430.9995422363281,
"logps/rejected": -448.747314453125,
"loss": 0.6533,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4200453758239746,
"rewards/margins": 0.17433959245681763,
"rewards/rejected": -1.594385027885437,
"step": 228
},
{
"epoch": 0.4895123580494322,
"grad_norm": 8.307584591306474,
"learning_rate": 6.039558454088795e-07,
"logits/chosen": -0.6406713128089905,
"logits/rejected": -0.6399562358856201,
"logps/chosen": -332.7983703613281,
"logps/rejected": -353.19384765625,
"loss": 0.5913,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1042307615280151,
"rewards/margins": 0.19572903215885162,
"rewards/rejected": -1.2999597787857056,
"step": 229
},
{
"epoch": 0.4916499665998664,
"grad_norm": 11.994876856372567,
"learning_rate": 6.002947078916364e-07,
"logits/chosen": -0.6426191926002502,
"logits/rejected": -0.6602756977081299,
"logps/chosen": -344.1719665527344,
"logps/rejected": -356.783935546875,
"loss": 0.69,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2262554168701172,
"rewards/margins": 0.10306321084499359,
"rewards/rejected": -1.329318642616272,
"step": 230
},
{
"epoch": 0.4937875751503006,
"grad_norm": 10.289938408873015,
"learning_rate": 5.966279588977766e-07,
"logits/chosen": -0.7598620653152466,
"logits/rejected": -0.7735162377357483,
"logps/chosen": -382.27630615234375,
"logps/rejected": -393.7611083984375,
"loss": 0.6243,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.1877317428588867,
"rewards/margins": 0.17938965559005737,
"rewards/rejected": -1.3671213388442993,
"step": 231
},
{
"epoch": 0.4959251837007348,
"grad_norm": 10.32285872025184,
"learning_rate": 5.929558035814574e-07,
"logits/chosen": -0.5800771713256836,
"logits/rejected": -0.5892568826675415,
"logps/chosen": -364.911376953125,
"logps/rejected": -363.5468444824219,
"loss": 0.6196,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.3333511352539062,
"rewards/margins": 0.1056426540017128,
"rewards/rejected": -1.4389936923980713,
"step": 232
},
{
"epoch": 0.498062792251169,
"grad_norm": 10.3934909690253,
"learning_rate": 5.892784473993183e-07,
"logits/chosen": -0.6197159290313721,
"logits/rejected": -0.6411285400390625,
"logps/chosen": -372.03424072265625,
"logps/rejected": -401.731201171875,
"loss": 0.5626,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.235286831855774,
"rewards/margins": 0.3155067563056946,
"rewards/rejected": -1.5507938861846924,
"step": 233
},
{
"epoch": 0.5002004008016032,
"grad_norm": 10.257979100996899,
"learning_rate": 5.855960960989876e-07,
"logits/chosen": -0.7090120911598206,
"logits/rejected": -0.6980421543121338,
"logps/chosen": -328.9789123535156,
"logps/rejected": -349.19036865234375,
"loss": 0.6148,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2482969760894775,
"rewards/margins": 0.14759615063667297,
"rewards/rejected": -1.3958930969238281,
"step": 234
},
{
"epoch": 0.5023380093520374,
"grad_norm": 8.317098893301642,
"learning_rate": 5.819089557075688e-07,
"logits/chosen": -0.7996770739555359,
"logits/rejected": -0.7929503917694092,
"logps/chosen": -331.7429504394531,
"logps/rejected": -369.83746337890625,
"loss": 0.6066,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.9831193089485168,
"rewards/margins": 0.4169498383998871,
"rewards/rejected": -1.400068998336792,
"step": 235
},
{
"epoch": 0.5044756179024716,
"grad_norm": 10.217065084123991,
"learning_rate": 5.782172325201155e-07,
"logits/chosen": -0.6208564043045044,
"logits/rejected": -0.635725200176239,
"logps/chosen": -344.1796875,
"logps/rejected": -389.52923583984375,
"loss": 0.6368,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1277637481689453,
"rewards/margins": 0.37036919593811035,
"rewards/rejected": -1.4981330633163452,
"step": 236
},
{
"epoch": 0.5066132264529059,
"grad_norm": 10.269160973282458,
"learning_rate": 5.745211330880872e-07,
"logits/chosen": -0.7708931565284729,
"logits/rejected": -0.76704341173172,
"logps/chosen": -433.10064697265625,
"logps/rejected": -450.0901184082031,
"loss": 0.6314,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3207013607025146,
"rewards/margins": 0.2793017327785492,
"rewards/rejected": -1.6000031232833862,
"step": 237
},
{
"epoch": 0.5087508350033401,
"grad_norm": 10.399405697134318,
"learning_rate": 5.708208642077945e-07,
"logits/chosen": -0.6624871492385864,
"logits/rejected": -0.6546816825866699,
"logps/chosen": -333.7100524902344,
"logps/rejected": -369.9173278808594,
"loss": 0.6294,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1885484457015991,
"rewards/margins": 0.3089646100997925,
"rewards/rejected": -1.4975128173828125,
"step": 238
},
{
"epoch": 0.5108884435537742,
"grad_norm": 9.80964346078248,
"learning_rate": 5.671166329088277e-07,
"logits/chosen": -0.7182386517524719,
"logits/rejected": -0.7258840203285217,
"logps/chosen": -356.9850769042969,
"logps/rejected": -380.86285400390625,
"loss": 0.6235,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3426018953323364,
"rewards/margins": 0.24537137150764465,
"rewards/rejected": -1.5879731178283691,
"step": 239
},
{
"epoch": 0.5130260521042084,
"grad_norm": 10.018454517912645,
"learning_rate": 5.634086464424742e-07,
"logits/chosen": -0.6738543510437012,
"logits/rejected": -0.6593906283378601,
"logps/chosen": -359.072021484375,
"logps/rejected": -400.2255554199219,
"loss": 0.6674,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3493218421936035,
"rewards/margins": 0.24295195937156677,
"rewards/rejected": -1.5922737121582031,
"step": 240
},
{
"epoch": 0.5151636606546426,
"grad_norm": 9.883809912496002,
"learning_rate": 5.596971122701221e-07,
"logits/chosen": -0.8064689636230469,
"logits/rejected": -0.777323305606842,
"logps/chosen": -383.2707214355469,
"logps/rejected": -394.1632995605469,
"loss": 0.6133,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2935426235198975,
"rewards/margins": 0.17417016625404358,
"rewards/rejected": -1.4677127599716187,
"step": 241
},
{
"epoch": 0.5173012692050768,
"grad_norm": 9.835505308989157,
"learning_rate": 5.559822380516539e-07,
"logits/chosen": -0.74181067943573,
"logits/rejected": -0.76103276014328,
"logps/chosen": -413.28607177734375,
"logps/rejected": -432.421142578125,
"loss": 0.6135,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.6156772375106812,
"rewards/margins": 0.052678730338811874,
"rewards/rejected": -1.668355941772461,
"step": 242
},
{
"epoch": 0.519438877755511,
"grad_norm": 10.044057697511136,
"learning_rate": 5.522642316338268e-07,
"logits/chosen": -0.7109071016311646,
"logits/rejected": -0.738073468208313,
"logps/chosen": -371.0440673828125,
"logps/rejected": -417.84588623046875,
"loss": 0.6108,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2593053579330444,
"rewards/margins": 0.29793858528137207,
"rewards/rejected": -1.557244062423706,
"step": 243
},
{
"epoch": 0.5215764863059452,
"grad_norm": 9.78032272573038,
"learning_rate": 5.48543301038644e-07,
"logits/chosen": -0.8035364747047424,
"logits/rejected": -0.817245364189148,
"logps/chosen": -408.1662292480469,
"logps/rejected": -431.6890869140625,
"loss": 0.657,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.3148820400238037,
"rewards/margins": 0.27137643098831177,
"rewards/rejected": -1.5862585306167603,
"step": 244
},
{
"epoch": 0.5237140948563794,
"grad_norm": 11.286066879084709,
"learning_rate": 5.448196544517167e-07,
"logits/chosen": -0.8000929355621338,
"logits/rejected": -0.7960721254348755,
"logps/chosen": -348.828369140625,
"logps/rejected": -370.3882751464844,
"loss": 0.6377,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3299049139022827,
"rewards/margins": 0.21843267977237701,
"rewards/rejected": -1.5483375787734985,
"step": 245
},
{
"epoch": 0.5258517034068136,
"grad_norm": 9.544361896918447,
"learning_rate": 5.410935002106152e-07,
"logits/chosen": -0.7660020589828491,
"logits/rejected": -0.7475563883781433,
"logps/chosen": -402.910400390625,
"logps/rejected": -406.4446105957031,
"loss": 0.6237,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.283268690109253,
"rewards/margins": 0.24972115457057953,
"rewards/rejected": -1.5329898595809937,
"step": 246
},
{
"epoch": 0.5279893119572479,
"grad_norm": 9.1575605917451,
"learning_rate": 5.373650467932121e-07,
"logits/chosen": -0.741169273853302,
"logits/rejected": -0.7101236581802368,
"logps/chosen": -353.3587951660156,
"logps/rejected": -394.3509521484375,
"loss": 0.5927,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2977474927902222,
"rewards/margins": 0.445009708404541,
"rewards/rejected": -1.7427570819854736,
"step": 247
},
{
"epoch": 0.5301269205076821,
"grad_norm": 10.605879178884328,
"learning_rate": 5.336345028060199e-07,
"logits/chosen": -0.735455334186554,
"logits/rejected": -0.7146904468536377,
"logps/chosen": -415.8868103027344,
"logps/rejected": -471.6510925292969,
"loss": 0.6285,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4680719375610352,
"rewards/margins": 0.37900200486183167,
"rewards/rejected": -1.8470739126205444,
"step": 248
},
{
"epoch": 0.5322645290581163,
"grad_norm": 9.468418276322426,
"learning_rate": 5.299020769725171e-07,
"logits/chosen": -0.6703728437423706,
"logits/rejected": -0.6554571986198425,
"logps/chosen": -414.7881164550781,
"logps/rejected": -444.9212646484375,
"loss": 0.6305,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.5948419570922852,
"rewards/margins": 0.24465849995613098,
"rewards/rejected": -1.8395004272460938,
"step": 249
},
{
"epoch": 0.5344021376085505,
"grad_norm": 10.105402066971774,
"learning_rate": 5.26167978121472e-07,
"logits/chosen": -0.6142255663871765,
"logits/rejected": -0.5848169922828674,
"logps/chosen": -390.851806640625,
"logps/rejected": -430.1876220703125,
"loss": 0.6119,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5544801950454712,
"rewards/margins": 0.30869632959365845,
"rewards/rejected": -1.8631765842437744,
"step": 250
},
{
"epoch": 0.5365397461589846,
"grad_norm": 9.810217626805535,
"learning_rate": 5.224324151752575e-07,
"logits/chosen": -0.6183363795280457,
"logits/rejected": -0.6150676608085632,
"logps/chosen": -367.5179443359375,
"logps/rejected": -416.97332763671875,
"loss": 0.6083,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2182056903839111,
"rewards/margins": 0.5702115893363953,
"rewards/rejected": -1.7884173393249512,
"step": 251
},
{
"epoch": 0.5386773547094188,
"grad_norm": 11.63997005510131,
"learning_rate": 5.18695597138163e-07,
"logits/chosen": -0.7786095142364502,
"logits/rejected": -0.7649445533752441,
"logps/chosen": -406.415771484375,
"logps/rejected": -438.0752868652344,
"loss": 0.6444,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.4087908267974854,
"rewards/margins": 0.3740866780281067,
"rewards/rejected": -1.7828774452209473,
"step": 252
},
{
"epoch": 0.540814963259853,
"grad_norm": 8.971335597918381,
"learning_rate": 5.149577330846992e-07,
"logits/chosen": -0.722287118434906,
"logits/rejected": -0.7298377752304077,
"logps/chosen": -385.11602783203125,
"logps/rejected": -462.1266784667969,
"loss": 0.5901,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3030569553375244,
"rewards/margins": 0.4734255373477936,
"rewards/rejected": -1.7764827013015747,
"step": 253
},
{
"epoch": 0.5429525718102872,
"grad_norm": 10.226248494849832,
"learning_rate": 5.112190321479025e-07,
"logits/chosen": -0.7946709990501404,
"logits/rejected": -0.7953794598579407,
"logps/chosen": -365.50604248046875,
"logps/rejected": -393.0849914550781,
"loss": 0.6099,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3220914602279663,
"rewards/margins": 0.16996119916439056,
"rewards/rejected": -1.492052674293518,
"step": 254
},
{
"epoch": 0.5450901803607214,
"grad_norm": 9.70395445393845,
"learning_rate": 5.074797035076318e-07,
"logits/chosen": -0.8279726505279541,
"logits/rejected": -0.8029213547706604,
"logps/chosen": -363.36956787109375,
"logps/rejected": -353.39300537109375,
"loss": 0.5921,
"rewards/accuracies": 0.46875,
"rewards/chosen": -1.4433786869049072,
"rewards/margins": 0.08160518109798431,
"rewards/rejected": -1.5249840021133423,
"step": 255
},
{
"epoch": 0.5472277889111556,
"grad_norm": 10.09145149251664,
"learning_rate": 5.037399563788664e-07,
"logits/chosen": -0.6333373785018921,
"logits/rejected": -0.6277045011520386,
"logps/chosen": -363.3057861328125,
"logps/rejected": -414.02874755859375,
"loss": 0.5775,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2467057704925537,
"rewards/margins": 0.4376518726348877,
"rewards/rejected": -1.6843575239181519,
"step": 256
},
{
"epoch": 0.5493653974615899,
"grad_norm": 10.227202697175395,
"learning_rate": 5e-07,
"logits/chosen": -0.7193889021873474,
"logits/rejected": -0.7205474376678467,
"logps/chosen": -384.7895812988281,
"logps/rejected": -409.3086242675781,
"loss": 0.6474,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4086096286773682,
"rewards/margins": 0.14859981834888458,
"rewards/rejected": -1.5572093725204468,
"step": 257
},
{
"epoch": 0.5515030060120241,
"grad_norm": 10.455321411037115,
"learning_rate": 4.962600436211335e-07,
"logits/chosen": -0.7665015459060669,
"logits/rejected": -0.751805305480957,
"logps/chosen": -353.6752624511719,
"logps/rejected": -387.00494384765625,
"loss": 0.6357,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2000305652618408,
"rewards/margins": 0.3206770420074463,
"rewards/rejected": -1.5207074880599976,
"step": 258
},
{
"epoch": 0.5536406145624583,
"grad_norm": 10.67800131716867,
"learning_rate": 4.925202964923683e-07,
"logits/chosen": -0.67658931016922,
"logits/rejected": -0.6737143397331238,
"logps/chosen": -357.2424011230469,
"logps/rejected": -380.38238525390625,
"loss": 0.5989,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1178100109100342,
"rewards/margins": 0.21349495649337769,
"rewards/rejected": -1.3313050270080566,
"step": 259
},
{
"epoch": 0.5557782231128925,
"grad_norm": 10.755754910243839,
"learning_rate": 4.887809678520975e-07,
"logits/chosen": -0.7121912240982056,
"logits/rejected": -0.6941719055175781,
"logps/chosen": -311.152587890625,
"logps/rejected": -341.55682373046875,
"loss": 0.579,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0979379415512085,
"rewards/margins": 0.32470834255218506,
"rewards/rejected": -1.4226462841033936,
"step": 260
},
{
"epoch": 0.5579158316633267,
"grad_norm": 10.029694987313983,
"learning_rate": 4.850422669153009e-07,
"logits/chosen": -0.7704156041145325,
"logits/rejected": -0.7731869220733643,
"logps/chosen": -433.1751403808594,
"logps/rejected": -475.2624206542969,
"loss": 0.6159,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.5628148317337036,
"rewards/margins": 0.27969640493392944,
"rewards/rejected": -1.842511534690857,
"step": 261
},
{
"epoch": 0.5600534402137608,
"grad_norm": 9.985077304371496,
"learning_rate": 4.813044028618372e-07,
"logits/chosen": -0.655546247959137,
"logits/rejected": -0.5991637110710144,
"logps/chosen": -311.8508605957031,
"logps/rejected": -352.53912353515625,
"loss": 0.6149,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1894161701202393,
"rewards/margins": 0.4629126489162445,
"rewards/rejected": -1.6523289680480957,
"step": 262
},
{
"epoch": 0.562191048764195,
"grad_norm": 11.42063256185086,
"learning_rate": 4.775675848247427e-07,
"logits/chosen": -0.7124533653259277,
"logits/rejected": -0.7007814645767212,
"logps/chosen": -349.7750549316406,
"logps/rejected": -395.9293518066406,
"loss": 0.606,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2669329643249512,
"rewards/margins": 0.35066768527030945,
"rewards/rejected": -1.617600679397583,
"step": 263
},
{
"epoch": 0.5643286573146292,
"grad_norm": 10.025460802753425,
"learning_rate": 4.7383202187852804e-07,
"logits/chosen": -0.6652883887290955,
"logits/rejected": -0.6626304388046265,
"logps/chosen": -350.379150390625,
"logps/rejected": -388.03955078125,
"loss": 0.5997,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4017623662948608,
"rewards/margins": 0.309231698513031,
"rewards/rejected": -1.710994005203247,
"step": 264
},
{
"epoch": 0.5664662658650634,
"grad_norm": 10.653930828086093,
"learning_rate": 4.700979230274829e-07,
"logits/chosen": -0.7248793244361877,
"logits/rejected": -0.750960648059845,
"logps/chosen": -394.911865234375,
"logps/rejected": -443.53643798828125,
"loss": 0.6179,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.381069540977478,
"rewards/margins": 0.36748361587524414,
"rewards/rejected": -1.7485532760620117,
"step": 265
},
{
"epoch": 0.5686038744154976,
"grad_norm": 9.87511145308802,
"learning_rate": 4.6636549719398016e-07,
"logits/chosen": -0.7590113878250122,
"logits/rejected": -0.7530328035354614,
"logps/chosen": -422.1754150390625,
"logps/rejected": -463.882080078125,
"loss": 0.5906,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4037176370620728,
"rewards/margins": 0.3301146626472473,
"rewards/rejected": -1.7338322401046753,
"step": 266
},
{
"epoch": 0.5707414829659319,
"grad_norm": 10.40705399730886,
"learning_rate": 4.626349532067879e-07,
"logits/chosen": -0.5113621950149536,
"logits/rejected": -0.4636048972606659,
"logps/chosen": -402.9996337890625,
"logps/rejected": -432.8628845214844,
"loss": 0.6439,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4809210300445557,
"rewards/margins": 0.3719174861907959,
"rewards/rejected": -1.8528385162353516,
"step": 267
},
{
"epoch": 0.5728790915163661,
"grad_norm": 9.883415102033252,
"learning_rate": 4.5890649978938487e-07,
"logits/chosen": -0.7086624503135681,
"logits/rejected": -0.6735981702804565,
"logps/chosen": -396.3412170410156,
"logps/rejected": -393.7690734863281,
"loss": 0.5721,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.459097146987915,
"rewards/margins": 0.10202471911907196,
"rewards/rejected": -1.561121940612793,
"step": 268
},
{
"epoch": 0.5750167000668003,
"grad_norm": 11.674476580668774,
"learning_rate": 4.5518034554828327e-07,
"logits/chosen": -0.7449507117271423,
"logits/rejected": -0.722856879234314,
"logps/chosen": -426.8054504394531,
"logps/rejected": -444.621337890625,
"loss": 0.6042,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3702492713928223,
"rewards/margins": 0.2847437858581543,
"rewards/rejected": -1.6549930572509766,
"step": 269
},
{
"epoch": 0.5771543086172345,
"grad_norm": 10.79085847307073,
"learning_rate": 4.514566989613559e-07,
"logits/chosen": -0.7816205024719238,
"logits/rejected": -0.7831264734268188,
"logps/chosen": -380.7841796875,
"logps/rejected": -405.67108154296875,
"loss": 0.6769,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4601352214813232,
"rewards/margins": 0.22311216592788696,
"rewards/rejected": -1.6832473278045654,
"step": 270
},
{
"epoch": 0.5792919171676687,
"grad_norm": 13.754123888989874,
"learning_rate": 4.477357683661733e-07,
"logits/chosen": -0.6621173620223999,
"logits/rejected": -0.6234359741210938,
"logps/chosen": -376.8826599121094,
"logps/rejected": -421.74981689453125,
"loss": 0.6674,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3359074592590332,
"rewards/margins": 0.37606099247932434,
"rewards/rejected": -1.7119684219360352,
"step": 271
},
{
"epoch": 0.5814295257181029,
"grad_norm": 10.684496603394274,
"learning_rate": 4.4401776194834603e-07,
"logits/chosen": -0.7525122761726379,
"logits/rejected": -0.6963589787483215,
"logps/chosen": -329.6082458496094,
"logps/rejected": -376.0239562988281,
"loss": 0.6287,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1713266372680664,
"rewards/margins": 0.28129494190216064,
"rewards/rejected": -1.452621579170227,
"step": 272
},
{
"epoch": 0.5835671342685371,
"grad_norm": 9.352339379386558,
"learning_rate": 4.403028877298779e-07,
"logits/chosen": -0.6548051238059998,
"logits/rejected": -0.632011890411377,
"logps/chosen": -384.2966003417969,
"logps/rejected": -421.78839111328125,
"loss": 0.622,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.369025707244873,
"rewards/margins": 0.33843424916267395,
"rewards/rejected": -1.707459807395935,
"step": 273
},
{
"epoch": 0.5857047428189712,
"grad_norm": 9.078233343454654,
"learning_rate": 4.3659135355752593e-07,
"logits/chosen": -0.6783146858215332,
"logits/rejected": -0.6960130929946899,
"logps/chosen": -353.8924560546875,
"logps/rejected": -399.84918212890625,
"loss": 0.5956,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3400187492370605,
"rewards/margins": 0.32464563846588135,
"rewards/rejected": -1.664664387702942,
"step": 274
},
{
"epoch": 0.5878423513694054,
"grad_norm": 15.480458345502447,
"learning_rate": 4.328833670911724e-07,
"logits/chosen": -0.651485025882721,
"logits/rejected": -0.6426280736923218,
"logps/chosen": -407.6331481933594,
"logps/rejected": -407.97271728515625,
"loss": 0.6394,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.497727394104004,
"rewards/margins": 0.0166710764169693,
"rewards/rejected": -1.514398455619812,
"step": 275
},
{
"epoch": 0.5899799599198396,
"grad_norm": 9.317803722726895,
"learning_rate": 4.2917913579220553e-07,
"logits/chosen": -0.7354484796524048,
"logits/rejected": -0.7279876470565796,
"logps/chosen": -336.7724914550781,
"logps/rejected": -337.7842712402344,
"loss": 0.6297,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3154979944229126,
"rewards/margins": 0.18782049417495728,
"rewards/rejected": -1.5033185482025146,
"step": 276
},
{
"epoch": 0.5921175684702739,
"grad_norm": 10.981393496040226,
"learning_rate": 4.254788669119127e-07,
"logits/chosen": -0.6517477631568909,
"logits/rejected": -0.6439751386642456,
"logps/chosen": -398.1854553222656,
"logps/rejected": -394.73992919921875,
"loss": 0.6151,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.5254887342453003,
"rewards/margins": 0.09052658081054688,
"rewards/rejected": -1.6160151958465576,
"step": 277
},
{
"epoch": 0.5942551770207081,
"grad_norm": 9.363953745062531,
"learning_rate": 4.2178276747988444e-07,
"logits/chosen": -0.7151267528533936,
"logits/rejected": -0.6989988088607788,
"logps/chosen": -399.211669921875,
"logps/rejected": -472.8134765625,
"loss": 0.6081,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.5025238990783691,
"rewards/margins": 0.6458090543746948,
"rewards/rejected": -2.1483330726623535,
"step": 278
},
{
"epoch": 0.5963927855711423,
"grad_norm": 10.996768453375289,
"learning_rate": 4.180910442924311e-07,
"logits/chosen": -0.6743846535682678,
"logits/rejected": -0.6869890093803406,
"logps/chosen": -349.3891296386719,
"logps/rejected": -385.591064453125,
"loss": 0.6559,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1990691423416138,
"rewards/margins": 0.24665698409080505,
"rewards/rejected": -1.4457261562347412,
"step": 279
},
{
"epoch": 0.5985303941215765,
"grad_norm": 12.076546659241137,
"learning_rate": 4.144039039010124e-07,
"logits/chosen": -0.7634164094924927,
"logits/rejected": -0.7913932204246521,
"logps/chosen": -363.1887512207031,
"logps/rejected": -416.5007019042969,
"loss": 0.599,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0668609142303467,
"rewards/margins": 0.44077447056770325,
"rewards/rejected": -1.507635474205017,
"step": 280
},
{
"epoch": 0.6006680026720107,
"grad_norm": 11.14137973799164,
"learning_rate": 4.107215526006817e-07,
"logits/chosen": -0.7002226114273071,
"logits/rejected": -0.7134915590286255,
"logps/chosen": -370.8570556640625,
"logps/rejected": -408.5722961425781,
"loss": 0.664,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3522167205810547,
"rewards/margins": 0.2257714569568634,
"rewards/rejected": -1.5779881477355957,
"step": 281
},
{
"epoch": 0.6028056112224449,
"grad_norm": 10.43225088057876,
"learning_rate": 4.070441964185427e-07,
"logits/chosen": -0.6937713623046875,
"logits/rejected": -0.6445334553718567,
"logps/chosen": -320.21636962890625,
"logps/rejected": -386.27337646484375,
"loss": 0.6365,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0563011169433594,
"rewards/margins": 0.5464246869087219,
"rewards/rejected": -1.6027257442474365,
"step": 282
},
{
"epoch": 0.6049432197728791,
"grad_norm": 9.579570581028333,
"learning_rate": 4.0337204110222347e-07,
"logits/chosen": -0.7348592281341553,
"logits/rejected": -0.7190099954605103,
"logps/chosen": -368.09918212890625,
"logps/rejected": -410.2267761230469,
"loss": 0.6029,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1717023849487305,
"rewards/margins": 0.4082415699958801,
"rewards/rejected": -1.5799440145492554,
"step": 283
},
{
"epoch": 0.6070808283233133,
"grad_norm": 10.220765953116597,
"learning_rate": 3.997052921083636e-07,
"logits/chosen": -0.6168830394744873,
"logits/rejected": -0.6260079145431519,
"logps/chosen": -374.4775695800781,
"logps/rejected": -405.28546142578125,
"loss": 0.6056,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2549736499786377,
"rewards/margins": 0.39645275473594666,
"rewards/rejected": -1.6514263153076172,
"step": 284
},
{
"epoch": 0.6092184368737475,
"grad_norm": 11.754142515548534,
"learning_rate": 3.960441545911204e-07,
"logits/chosen": -0.7724018096923828,
"logits/rejected": -0.8003143668174744,
"logps/chosen": -411.28765869140625,
"logps/rejected": -446.0718688964844,
"loss": 0.6254,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3750414848327637,
"rewards/margins": 0.19460612535476685,
"rewards/rejected": -1.5696475505828857,
"step": 285
},
{
"epoch": 0.6113560454241816,
"grad_norm": 11.71195693335506,
"learning_rate": 3.92388833390689e-07,
"logits/chosen": -0.6072220206260681,
"logits/rejected": -0.5882732272148132,
"logps/chosen": -362.8934020996094,
"logps/rejected": -384.33685302734375,
"loss": 0.6421,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4378963708877563,
"rewards/margins": 0.3072332739830017,
"rewards/rejected": -1.7451298236846924,
"step": 286
},
{
"epoch": 0.6134936539746159,
"grad_norm": 10.423314767059496,
"learning_rate": 3.8873953302184283e-07,
"logits/chosen": -0.6478594541549683,
"logits/rejected": -0.6148996949195862,
"logps/chosen": -402.24993896484375,
"logps/rejected": -419.41510009765625,
"loss": 0.6184,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.573075532913208,
"rewards/margins": 0.27030453085899353,
"rewards/rejected": -1.8433799743652344,
"step": 287
},
{
"epoch": 0.6156312625250501,
"grad_norm": 11.589829757981947,
"learning_rate": 3.8509645766249034e-07,
"logits/chosen": -0.7512708902359009,
"logits/rejected": -0.7593178749084473,
"logps/chosen": -430.9858093261719,
"logps/rejected": -473.86578369140625,
"loss": 0.6066,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.5662776231765747,
"rewards/margins": 0.3962094187736511,
"rewards/rejected": -1.9624871015548706,
"step": 288
},
{
"epoch": 0.6177688710754843,
"grad_norm": 38.79630588602357,
"learning_rate": 3.814598111422513e-07,
"logits/chosen": -0.7107813358306885,
"logits/rejected": -0.7043961882591248,
"logps/chosen": -359.62713623046875,
"logps/rejected": -373.19805908203125,
"loss": 0.6213,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.3326873779296875,
"rewards/margins": 0.10174018889665604,
"rewards/rejected": -1.4344274997711182,
"step": 289
},
{
"epoch": 0.6199064796259185,
"grad_norm": 9.198178073702046,
"learning_rate": 3.778297969310529e-07,
"logits/chosen": -0.7122032046318054,
"logits/rejected": -0.7226367592811584,
"logps/chosen": -360.8456726074219,
"logps/rejected": -395.7742614746094,
"loss": 0.6057,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3359217643737793,
"rewards/margins": 0.3005616068840027,
"rewards/rejected": -1.6364833116531372,
"step": 290
},
{
"epoch": 0.6220440881763527,
"grad_norm": 11.585726416356431,
"learning_rate": 3.742066181277457e-07,
"logits/chosen": -0.6904798150062561,
"logits/rejected": -0.6984922885894775,
"logps/chosen": -385.777587890625,
"logps/rejected": -417.1224670410156,
"loss": 0.602,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4644272327423096,
"rewards/margins": 0.26233839988708496,
"rewards/rejected": -1.7267656326293945,
"step": 291
},
{
"epoch": 0.6241816967267869,
"grad_norm": 10.98477533754328,
"learning_rate": 3.7059047744873955e-07,
"logits/chosen": -0.6717097759246826,
"logits/rejected": -0.6137974262237549,
"logps/chosen": -388.8412780761719,
"logps/rejected": -408.81964111328125,
"loss": 0.6424,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4904993772506714,
"rewards/margins": 0.2106323540210724,
"rewards/rejected": -1.701131820678711,
"step": 292
},
{
"epoch": 0.6263193052772211,
"grad_norm": 11.098357379348672,
"learning_rate": 3.669815772166625e-07,
"logits/chosen": -0.7643608450889587,
"logits/rejected": -0.7616855502128601,
"logps/chosen": -399.7235412597656,
"logps/rejected": -444.3169250488281,
"loss": 0.5882,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1846121549606323,
"rewards/margins": 0.3367740213871002,
"rewards/rejected": -1.5213862657546997,
"step": 293
},
{
"epoch": 0.6284569138276553,
"grad_norm": 9.823709075894158,
"learning_rate": 3.6338011934904e-07,
"logits/chosen": -0.7340261936187744,
"logits/rejected": -0.7253273129463196,
"logps/chosen": -415.0310974121094,
"logps/rejected": -479.2626953125,
"loss": 0.5839,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4078618288040161,
"rewards/margins": 0.7003488540649414,
"rewards/rejected": -2.108210802078247,
"step": 294
},
{
"epoch": 0.6305945223780896,
"grad_norm": 10.87697682105901,
"learning_rate": 3.5978630534699865e-07,
"logits/chosen": -0.6595284342765808,
"logits/rejected": -0.6863126754760742,
"logps/chosen": -365.39013671875,
"logps/rejected": -408.86541748046875,
"loss": 0.652,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.497340202331543,
"rewards/margins": 0.30657070875167847,
"rewards/rejected": -1.803910732269287,
"step": 295
},
{
"epoch": 0.6327321309285238,
"grad_norm": 11.045318981796374,
"learning_rate": 3.562003362839914e-07,
"logits/chosen": -0.7206366062164307,
"logits/rejected": -0.7295577526092529,
"logps/chosen": -461.62225341796875,
"logps/rejected": -458.39703369140625,
"loss": 0.6665,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.7048362493515015,
"rewards/margins": 0.04338730126619339,
"rewards/rejected": -1.7482235431671143,
"step": 296
},
{
"epoch": 0.6348697394789579,
"grad_norm": 10.855323994177997,
"learning_rate": 3.526224127945478e-07,
"logits/chosen": -0.6919922828674316,
"logits/rejected": -0.6954550743103027,
"logps/chosen": -336.01556396484375,
"logps/rejected": -376.9835205078125,
"loss": 0.6407,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3392897844314575,
"rewards/margins": 0.2275466024875641,
"rewards/rejected": -1.5668363571166992,
"step": 297
},
{
"epoch": 0.6370073480293921,
"grad_norm": 9.58788965705626,
"learning_rate": 3.49052735063049e-07,
"logits/chosen": -0.8874866962432861,
"logits/rejected": -0.8917239904403687,
"logps/chosen": -403.54693603515625,
"logps/rejected": -442.32305908203125,
"loss": 0.5722,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.3754301071166992,
"rewards/margins": 0.39617669582366943,
"rewards/rejected": -1.771606683731079,
"step": 298
},
{
"epoch": 0.6391449565798263,
"grad_norm": 10.56963230174736,
"learning_rate": 3.454915028125263e-07,
"logits/chosen": -0.6784321665763855,
"logits/rejected": -0.6550740003585815,
"logps/chosen": -406.8092041015625,
"logps/rejected": -409.7593688964844,
"loss": 0.6048,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4954036474227905,
"rewards/margins": 0.18573154509067535,
"rewards/rejected": -1.6811351776123047,
"step": 299
},
{
"epoch": 0.6412825651302605,
"grad_norm": 10.005595422402594,
"learning_rate": 3.4193891529348795e-07,
"logits/chosen": -0.7489890456199646,
"logits/rejected": -0.7623311281204224,
"logps/chosen": -440.9601135253906,
"logps/rejected": -441.59466552734375,
"loss": 0.6376,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6095386743545532,
"rewards/margins": 0.23035281896591187,
"rewards/rejected": -1.8398916721343994,
"step": 300
},
{
"epoch": 0.6412825651302605,
"eval_logits/chosen": -0.6758147478103638,
"eval_logits/rejected": -0.6752761006355286,
"eval_logps/chosen": -390.88177490234375,
"eval_logps/rejected": -425.3858947753906,
"eval_loss": 0.6177628040313721,
"eval_rewards/accuracies": 0.6747967600822449,
"eval_rewards/chosen": -1.3533374071121216,
"eval_rewards/margins": 0.28798195719718933,
"eval_rewards/rejected": -1.6413193941116333,
"eval_runtime": 377.299,
"eval_samples_per_second": 5.197,
"eval_steps_per_second": 0.326,
"step": 300
},
{
"epoch": 0.6434201736806947,
"grad_norm": 10.329473531690297,
"learning_rate": 3.3839517127277004e-07,
"logits/chosen": -0.7601391673088074,
"logits/rejected": -0.7844873070716858,
"logps/chosen": -393.47540283203125,
"logps/rejected": -435.7286071777344,
"loss": 0.6387,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.259373426437378,
"rewards/margins": 0.349065899848938,
"rewards/rejected": -1.608439326286316,
"step": 301
},
{
"epoch": 0.6455577822311289,
"grad_norm": 8.767557926350584,
"learning_rate": 3.348604690224166e-07,
"logits/chosen": -0.8301680088043213,
"logits/rejected": -0.8203250169754028,
"logps/chosen": -425.7115478515625,
"logps/rejected": -468.8160095214844,
"loss": 0.6113,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3414942026138306,
"rewards/margins": 0.44216763973236084,
"rewards/rejected": -1.7836618423461914,
"step": 302
},
{
"epoch": 0.6476953907815631,
"grad_norm": 11.252600228651138,
"learning_rate": 3.31335006308585e-07,
"logits/chosen": -0.7533825635910034,
"logits/rejected": -0.732757031917572,
"logps/chosen": -393.85040283203125,
"logps/rejected": -415.14080810546875,
"loss": 0.6301,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.3789258003234863,
"rewards/margins": 0.23148328065872192,
"rewards/rejected": -1.6104090213775635,
"step": 303
},
{
"epoch": 0.6498329993319973,
"grad_norm": 10.257975436900558,
"learning_rate": 3.2781898038048237e-07,
"logits/chosen": -0.6510428786277771,
"logits/rejected": -0.6685248613357544,
"logps/chosen": -390.3652038574219,
"logps/rejected": -393.8785400390625,
"loss": 0.6683,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.5787415504455566,
"rewards/margins": 0.11641066521406174,
"rewards/rejected": -1.6951522827148438,
"step": 304
},
{
"epoch": 0.6519706078824316,
"grad_norm": 9.860462354210497,
"learning_rate": 3.243125879593286e-07,
"logits/chosen": -0.7366013526916504,
"logits/rejected": -0.7264673709869385,
"logps/chosen": -361.6595458984375,
"logps/rejected": -408.7252197265625,
"loss": 0.6058,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1662776470184326,
"rewards/margins": 0.39646124839782715,
"rewards/rejected": -1.5627388954162598,
"step": 305
},
{
"epoch": 0.6541082164328658,
"grad_norm": 11.932221586274338,
"learning_rate": 3.2081602522734985e-07,
"logits/chosen": -0.7773129343986511,
"logits/rejected": -0.7762659788131714,
"logps/chosen": -384.003662109375,
"logps/rejected": -423.2783203125,
"loss": 0.5892,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3898005485534668,
"rewards/margins": 0.3622281551361084,
"rewards/rejected": -1.7520288228988647,
"step": 306
},
{
"epoch": 0.6562458249833,
"grad_norm": 9.369038313539917,
"learning_rate": 3.173294878168025e-07,
"logits/chosen": -0.6643047332763672,
"logits/rejected": -0.6601549386978149,
"logps/chosen": -372.4691162109375,
"logps/rejected": -406.27996826171875,
"loss": 0.6158,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.4243800640106201,
"rewards/margins": 0.29933756589889526,
"rewards/rejected": -1.7237175703048706,
"step": 307
},
{
"epoch": 0.6583834335337342,
"grad_norm": 10.15702054511366,
"learning_rate": 3.138531707990274e-07,
"logits/chosen": -0.6945326328277588,
"logits/rejected": -0.6813417673110962,
"logps/chosen": -367.9193115234375,
"logps/rejected": -422.1386413574219,
"loss": 0.5835,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1640138626098633,
"rewards/margins": 0.5344864726066589,
"rewards/rejected": -1.6985002756118774,
"step": 308
},
{
"epoch": 0.6605210420841683,
"grad_norm": 10.774899794292791,
"learning_rate": 3.1038726867353583e-07,
"logits/chosen": -0.678726315498352,
"logits/rejected": -0.706427276134491,
"logps/chosen": -402.2789001464844,
"logps/rejected": -475.16436767578125,
"loss": 0.5877,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.296879529953003,
"rewards/margins": 0.6074644327163696,
"rewards/rejected": -1.904344081878662,
"step": 309
},
{
"epoch": 0.6626586506346025,
"grad_norm": 9.326333800621224,
"learning_rate": 3.069319753571269e-07,
"logits/chosen": -0.7166895866394043,
"logits/rejected": -0.7176540493965149,
"logps/chosen": -386.4005432128906,
"logps/rejected": -395.38970947265625,
"loss": 0.6111,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.5514951944351196,
"rewards/margins": 0.11902564764022827,
"rewards/rejected": -1.6705207824707031,
"step": 310
},
{
"epoch": 0.6647962591850367,
"grad_norm": 11.49416505541279,
"learning_rate": 3.034874841730382e-07,
"logits/chosen": -0.7580830454826355,
"logits/rejected": -0.7336598634719849,
"logps/chosen": -402.9891052246094,
"logps/rejected": -430.2671813964844,
"loss": 0.6368,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3294634819030762,
"rewards/margins": 0.3222670555114746,
"rewards/rejected": -1.6517305374145508,
"step": 311
},
{
"epoch": 0.6669338677354709,
"grad_norm": 10.887373926899288,
"learning_rate": 3.000539878401296e-07,
"logits/chosen": -0.6197298765182495,
"logits/rejected": -0.5989848375320435,
"logps/chosen": -391.74951171875,
"logps/rejected": -449.1798400878906,
"loss": 0.6082,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4147597551345825,
"rewards/margins": 0.5554874539375305,
"rewards/rejected": -1.9702472686767578,
"step": 312
},
{
"epoch": 0.6690714762859051,
"grad_norm": 11.171229600938071,
"learning_rate": 2.9663167846209996e-07,
"logits/chosen": -0.6838382482528687,
"logits/rejected": -0.6743027567863464,
"logps/chosen": -368.8251037597656,
"logps/rejected": -415.6241149902344,
"loss": 0.6372,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.388074278831482,
"rewards/margins": 0.4560723900794983,
"rewards/rejected": -1.844146490097046,
"step": 313
},
{
"epoch": 0.6712090848363393,
"grad_norm": 10.489570114197578,
"learning_rate": 2.9322074751673974e-07,
"logits/chosen": -0.6488001346588135,
"logits/rejected": -0.6053016781806946,
"logps/chosen": -422.6211853027344,
"logps/rejected": -449.6383972167969,
"loss": 0.6577,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.6949717998504639,
"rewards/margins": 0.3523138463497162,
"rewards/rejected": -2.047285556793213,
"step": 314
},
{
"epoch": 0.6733466933867736,
"grad_norm": 9.335833798624803,
"learning_rate": 2.898213858452173e-07,
"logits/chosen": -0.7407481670379639,
"logits/rejected": -0.6984574794769287,
"logps/chosen": -426.7627868652344,
"logps/rejected": -433.4222106933594,
"loss": 0.6038,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.5585707426071167,
"rewards/margins": 0.30426639318466187,
"rewards/rejected": -1.8628369569778442,
"step": 315
},
{
"epoch": 0.6754843019372078,
"grad_norm": 9.82583152531341,
"learning_rate": 2.864337836414018e-07,
"logits/chosen": -0.7897535562515259,
"logits/rejected": -0.7509832382202148,
"logps/chosen": -440.0413818359375,
"logps/rejected": -473.6156311035156,
"loss": 0.5877,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.778262972831726,
"rewards/margins": 0.3270663917064667,
"rewards/rejected": -2.1053295135498047,
"step": 316
},
{
"epoch": 0.677621910487642,
"grad_norm": 10.397708781784715,
"learning_rate": 2.8305813044122093e-07,
"logits/chosen": -0.5974478125572205,
"logits/rejected": -0.5807868242263794,
"logps/chosen": -366.0530090332031,
"logps/rejected": -355.85882568359375,
"loss": 0.6565,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3970279693603516,
"rewards/margins": 0.0002168789505958557,
"rewards/rejected": -1.3972446918487549,
"step": 317
},
{
"epoch": 0.6797595190380762,
"grad_norm": 10.464645314526035,
"learning_rate": 2.7969461511205806e-07,
"logits/chosen": -0.626457691192627,
"logits/rejected": -0.5530537366867065,
"logps/chosen": -330.521240234375,
"logps/rejected": -358.5421142578125,
"loss": 0.6146,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.4030221700668335,
"rewards/margins": 0.22237975895404816,
"rewards/rejected": -1.6254019737243652,
"step": 318
},
{
"epoch": 0.6818971275885104,
"grad_norm": 9.874403173091292,
"learning_rate": 2.763434258421836e-07,
"logits/chosen": -0.7100091576576233,
"logits/rejected": -0.6709161996841431,
"logps/chosen": -342.3360595703125,
"logps/rejected": -356.4312744140625,
"loss": 0.6294,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.2349555492401123,
"rewards/margins": 0.18816961348056793,
"rewards/rejected": -1.4231250286102295,
"step": 319
},
{
"epoch": 0.6840347361389446,
"grad_norm": 10.416140198085172,
"learning_rate": 2.730047501302266e-07,
"logits/chosen": -0.7924367785453796,
"logits/rejected": -0.7890709638595581,
"logps/chosen": -402.750244140625,
"logps/rejected": -433.9951171875,
"loss": 0.5975,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3218122720718384,
"rewards/margins": 0.32942885160446167,
"rewards/rejected": -1.6512411832809448,
"step": 320
},
{
"epoch": 0.6861723446893787,
"grad_norm": 10.264424479929405,
"learning_rate": 2.696787747746839e-07,
"logits/chosen": -0.7326480150222778,
"logits/rejected": -0.727679967880249,
"logps/chosen": -335.9344177246094,
"logps/rejected": -376.9026794433594,
"loss": 0.6166,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2939304113388062,
"rewards/margins": 0.3992197811603546,
"rewards/rejected": -1.6931501626968384,
"step": 321
},
{
"epoch": 0.6883099532398129,
"grad_norm": 9.672418793392822,
"learning_rate": 2.6636568586346897e-07,
"logits/chosen": -0.7330962419509888,
"logits/rejected": -0.7231791615486145,
"logps/chosen": -344.6290588378906,
"logps/rejected": -368.3528137207031,
"loss": 0.6236,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1829473972320557,
"rewards/margins": 0.23583151400089264,
"rewards/rejected": -1.4187790155410767,
"step": 322
},
{
"epoch": 0.6904475617902471,
"grad_norm": 12.909094410970068,
"learning_rate": 2.6306566876350067e-07,
"logits/chosen": -0.7223283648490906,
"logits/rejected": -0.6862327456474304,
"logps/chosen": -427.14727783203125,
"logps/rejected": -453.6234436035156,
"loss": 0.5843,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5469043254852295,
"rewards/margins": 0.278840035200119,
"rewards/rejected": -1.825744390487671,
"step": 323
},
{
"epoch": 0.6925851703406813,
"grad_norm": 13.707807531422917,
"learning_rate": 2.597789081103313e-07,
"logits/chosen": -0.7629610300064087,
"logits/rejected": -0.727975070476532,
"logps/chosen": -382.91278076171875,
"logps/rejected": -421.9703369140625,
"loss": 0.5563,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.4632904529571533,
"rewards/margins": 0.4552845358848572,
"rewards/rejected": -1.9185751676559448,
"step": 324
},
{
"epoch": 0.6947227788911156,
"grad_norm": 10.588462296925226,
"learning_rate": 2.5650558779781635e-07,
"logits/chosen": -0.621019184589386,
"logits/rejected": -0.5743827223777771,
"logps/chosen": -433.55267333984375,
"logps/rejected": -461.3088073730469,
"loss": 0.6159,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.6322648525238037,
"rewards/margins": 0.2886176109313965,
"rewards/rejected": -1.9208825826644897,
"step": 325
},
{
"epoch": 0.6968603874415498,
"grad_norm": 9.6025165386732,
"learning_rate": 2.5324589096782656e-07,
"logits/chosen": -0.6759508848190308,
"logits/rejected": -0.6631283760070801,
"logps/chosen": -414.1610107421875,
"logps/rejected": -420.1568603515625,
"loss": 0.6298,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.416655421257019,
"rewards/margins": 0.17679718136787415,
"rewards/rejected": -1.5934526920318604,
"step": 326
},
{
"epoch": 0.698997995991984,
"grad_norm": 12.80932336769188,
"learning_rate": 2.500000000000001e-07,
"logits/chosen": -0.6499335765838623,
"logits/rejected": -0.662979245185852,
"logps/chosen": -405.96063232421875,
"logps/rejected": -447.20172119140625,
"loss": 0.6288,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4347225427627563,
"rewards/margins": 0.31777456402778625,
"rewards/rejected": -1.7524970769882202,
"step": 327
},
{
"epoch": 0.7011356045424182,
"grad_norm": 12.08919140781653,
"learning_rate": 2.467680965015387e-07,
"logits/chosen": -0.7271804213523865,
"logits/rejected": -0.7305589914321899,
"logps/chosen": -362.54632568359375,
"logps/rejected": -384.875,
"loss": 0.634,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2877211570739746,
"rewards/margins": 0.24846753478050232,
"rewards/rejected": -1.5361886024475098,
"step": 328
},
{
"epoch": 0.7032732130928524,
"grad_norm": 10.873874736167313,
"learning_rate": 2.4355036129704696e-07,
"logits/chosen": -0.6805239915847778,
"logits/rejected": -0.6776773929595947,
"logps/chosen": -472.7155456542969,
"logps/rejected": -522.7994384765625,
"loss": 0.6205,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.6964683532714844,
"rewards/margins": 0.3618759214878082,
"rewards/rejected": -2.0583443641662598,
"step": 329
},
{
"epoch": 0.7054108216432866,
"grad_norm": 16.081627749911508,
"learning_rate": 2.403469744184154e-07,
"logits/chosen": -0.7133939266204834,
"logits/rejected": -0.7143837809562683,
"logps/chosen": -382.49078369140625,
"logps/rejected": -437.5693664550781,
"loss": 0.6097,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.266930341720581,
"rewards/margins": 0.49213629961013794,
"rewards/rejected": -1.7590665817260742,
"step": 330
},
{
"epoch": 0.7075484301937208,
"grad_norm": 10.322962661870067,
"learning_rate": 2.371581150947476e-07,
"logits/chosen": -0.8041883707046509,
"logits/rejected": -0.8093154430389404,
"logps/chosen": -430.856689453125,
"logps/rejected": -477.88787841796875,
"loss": 0.6063,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3982198238372803,
"rewards/margins": 0.3465649485588074,
"rewards/rejected": -1.7447847127914429,
"step": 331
},
{
"epoch": 0.7096860387441549,
"grad_norm": 10.446630163251449,
"learning_rate": 2.3398396174233176e-07,
"logits/chosen": -0.6520624160766602,
"logits/rejected": -0.6437772512435913,
"logps/chosen": -422.0386047363281,
"logps/rejected": -486.0862731933594,
"loss": 0.6174,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6643366813659668,
"rewards/margins": 0.3572143316268921,
"rewards/rejected": -2.0215511322021484,
"step": 332
},
{
"epoch": 0.7118236472945891,
"grad_norm": 11.809866444989042,
"learning_rate": 2.3082469195465893e-07,
"logits/chosen": -0.7520323395729065,
"logits/rejected": -0.7196107506752014,
"logps/chosen": -411.33251953125,
"logps/rejected": -455.638916015625,
"loss": 0.5696,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.6863501071929932,
"rewards/margins": 0.3991457223892212,
"rewards/rejected": -2.085495710372925,
"step": 333
},
{
"epoch": 0.7139612558450233,
"grad_norm": 11.4706777134489,
"learning_rate": 2.2768048249248644e-07,
"logits/chosen": -0.6395952105522156,
"logits/rejected": -0.612390398979187,
"logps/chosen": -408.4999084472656,
"logps/rejected": -444.7389831542969,
"loss": 0.6339,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.7306082248687744,
"rewards/margins": 0.3128102421760559,
"rewards/rejected": -2.0434184074401855,
"step": 334
},
{
"epoch": 0.7160988643954576,
"grad_norm": 10.253605586246742,
"learning_rate": 2.2455150927394878e-07,
"logits/chosen": -0.6910028457641602,
"logits/rejected": -0.6887121200561523,
"logps/chosen": -373.79571533203125,
"logps/rejected": -457.9923095703125,
"loss": 0.6146,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.5869219303131104,
"rewards/margins": 0.5055859088897705,
"rewards/rejected": -2.0925076007843018,
"step": 335
},
{
"epoch": 0.7182364729458918,
"grad_norm": 10.09520238328347,
"learning_rate": 2.2143794736471388e-07,
"logits/chosen": -0.7225451469421387,
"logits/rejected": -0.7483439445495605,
"logps/chosen": -484.85748291015625,
"logps/rejected": -529.5263061523438,
"loss": 0.6224,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.8494200706481934,
"rewards/margins": 0.2027570605278015,
"rewards/rejected": -2.0521771907806396,
"step": 336
},
{
"epoch": 0.720374081496326,
"grad_norm": 10.071232143800623,
"learning_rate": 2.1833997096818895e-07,
"logits/chosen": -0.5754382610321045,
"logits/rejected": -0.5392119288444519,
"logps/chosen": -344.8245544433594,
"logps/rejected": -379.4691162109375,
"loss": 0.6219,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2857069969177246,
"rewards/margins": 0.34328341484069824,
"rewards/rejected": -1.6289904117584229,
"step": 337
},
{
"epoch": 0.7225116900467602,
"grad_norm": 10.65729584502251,
"learning_rate": 2.1525775341577402e-07,
"logits/chosen": -0.6606283187866211,
"logits/rejected": -0.6608355045318604,
"logps/chosen": -414.2405700683594,
"logps/rejected": -429.601806640625,
"loss": 0.5947,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3188872337341309,
"rewards/margins": 0.19126060605049133,
"rewards/rejected": -1.5101479291915894,
"step": 338
},
{
"epoch": 0.7246492985971944,
"grad_norm": 10.538063937615522,
"learning_rate": 2.121914671571633e-07,
"logits/chosen": -0.7743428945541382,
"logits/rejected": -0.7525985836982727,
"logps/chosen": -367.4284973144531,
"logps/rejected": -441.0301818847656,
"loss": 0.6129,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.3432848453521729,
"rewards/margins": 0.6236953735351562,
"rewards/rejected": -1.9669800996780396,
"step": 339
},
{
"epoch": 0.7267869071476286,
"grad_norm": 9.94512421358411,
"learning_rate": 2.0914128375069722e-07,
"logits/chosen": -0.7715132236480713,
"logits/rejected": -0.7799488306045532,
"logps/chosen": -431.17169189453125,
"logps/rejected": -494.21197509765625,
"loss": 0.5912,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5074207782745361,
"rewards/margins": 0.48499661684036255,
"rewards/rejected": -1.9924174547195435,
"step": 340
},
{
"epoch": 0.7289245156980628,
"grad_norm": 12.36515548240608,
"learning_rate": 2.0610737385376348e-07,
"logits/chosen": -0.7136672139167786,
"logits/rejected": -0.6798695921897888,
"logps/chosen": -405.33905029296875,
"logps/rejected": -437.63330078125,
"loss": 0.5978,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4325660467147827,
"rewards/margins": 0.23246146738529205,
"rewards/rejected": -1.6650276184082031,
"step": 341
},
{
"epoch": 0.731062124248497,
"grad_norm": 10.594170284983056,
"learning_rate": 2.0308990721324926e-07,
"logits/chosen": -0.6517391800880432,
"logits/rejected": -0.6472780108451843,
"logps/chosen": -456.90570068359375,
"logps/rejected": -480.61138916015625,
"loss": 0.5951,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.691187858581543,
"rewards/margins": 0.42632347345352173,
"rewards/rejected": -2.117511034011841,
"step": 342
},
{
"epoch": 0.7331997327989312,
"grad_norm": 11.483481596708172,
"learning_rate": 2.0008905265604315e-07,
"logits/chosen": -0.7073544263839722,
"logits/rejected": -0.6990326642990112,
"logps/chosen": -409.7100524902344,
"logps/rejected": -450.9366455078125,
"loss": 0.5441,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6152515411376953,
"rewards/margins": 0.3995630741119385,
"rewards/rejected": -2.014814853668213,
"step": 343
},
{
"epoch": 0.7353373413493653,
"grad_norm": 13.346456381620781,
"learning_rate": 1.971049780795901e-07,
"logits/chosen": -0.7003156542778015,
"logits/rejected": -0.6687884330749512,
"logps/chosen": -310.5570373535156,
"logps/rejected": -344.05859375,
"loss": 0.6145,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0297322273254395,
"rewards/margins": 0.3454705476760864,
"rewards/rejected": -1.3752026557922363,
"step": 344
},
{
"epoch": 0.7374749498997996,
"grad_norm": 10.139219194086207,
"learning_rate": 1.9413785044249676e-07,
"logits/chosen": -0.6944881677627563,
"logits/rejected": -0.6632839441299438,
"logps/chosen": -381.2460632324219,
"logps/rejected": -414.730712890625,
"loss": 0.5746,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.378089427947998,
"rewards/margins": 0.35800689458847046,
"rewards/rejected": -1.7360961437225342,
"step": 345
},
{
"epoch": 0.7396125584502338,
"grad_norm": 9.84020912855359,
"learning_rate": 1.9118783575519109e-07,
"logits/chosen": -0.7444390058517456,
"logits/rejected": -0.7687693238258362,
"logps/chosen": -441.13104248046875,
"logps/rejected": -471.73797607421875,
"loss": 0.6159,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.675041675567627,
"rewards/margins": 0.1502073109149933,
"rewards/rejected": -1.8252489566802979,
"step": 346
},
{
"epoch": 0.741750167000668,
"grad_norm": 11.291373399374436,
"learning_rate": 1.8825509907063326e-07,
"logits/chosen": -0.7405213117599487,
"logits/rejected": -0.7411251068115234,
"logps/chosen": -346.1521301269531,
"logps/rejected": -372.3758850097656,
"loss": 0.6207,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4540354013442993,
"rewards/margins": 0.28074249625205994,
"rewards/rejected": -1.7347780466079712,
"step": 347
},
{
"epoch": 0.7438877755511022,
"grad_norm": 9.478462210282277,
"learning_rate": 1.8533980447508135e-07,
"logits/chosen": -0.7745504975318909,
"logits/rejected": -0.7580114603042603,
"logps/chosen": -364.4132995605469,
"logps/rejected": -376.5552978515625,
"loss": 0.6103,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.342382550239563,
"rewards/margins": 0.21273840963840485,
"rewards/rejected": -1.5551210641860962,
"step": 348
},
{
"epoch": 0.7460253841015364,
"grad_norm": 11.376031296146607,
"learning_rate": 1.824421150789106e-07,
"logits/chosen": -0.588141918182373,
"logits/rejected": -0.6058573126792908,
"logps/chosen": -402.21026611328125,
"logps/rejected": -441.880615234375,
"loss": 0.6202,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4959633350372314,
"rewards/margins": 0.359602689743042,
"rewards/rejected": -1.8555659055709839,
"step": 349
},
{
"epoch": 0.7481629926519706,
"grad_norm": 9.3095000239465,
"learning_rate": 1.7956219300748792e-07,
"logits/chosen": -0.7804590463638306,
"logits/rejected": -0.768570601940155,
"logps/chosen": -395.44970703125,
"logps/rejected": -442.4303894042969,
"loss": 0.5622,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3593695163726807,
"rewards/margins": 0.4249880313873291,
"rewards/rejected": -1.7843575477600098,
"step": 350
},
{
"epoch": 0.7503006012024048,
"grad_norm": 10.971792168406296,
"learning_rate": 1.7670019939210023e-07,
"logits/chosen": -0.6696098446846008,
"logits/rejected": -0.6669338941574097,
"logps/chosen": -451.68768310546875,
"logps/rejected": -497.6464538574219,
"loss": 0.5897,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7285881042480469,
"rewards/margins": 0.4477997124195099,
"rewards/rejected": -2.1763877868652344,
"step": 351
},
{
"epoch": 0.752438209752839,
"grad_norm": 11.419971323161318,
"learning_rate": 1.7385629436093956e-07,
"logits/chosen": -0.6907357573509216,
"logits/rejected": -0.637013852596283,
"logps/chosen": -432.1939392089844,
"logps/rejected": -469.10162353515625,
"loss": 0.6008,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7298922538757324,
"rewards/margins": 0.38807937502861023,
"rewards/rejected": -2.117971420288086,
"step": 352
},
{
"epoch": 0.7545758183032732,
"grad_norm": 12.388342839443734,
"learning_rate": 1.710306370301437e-07,
"logits/chosen": -0.7042302489280701,
"logits/rejected": -0.7210839986801147,
"logps/chosen": -481.449951171875,
"logps/rejected": -541.4148559570312,
"loss": 0.6228,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7111109495162964,
"rewards/margins": 0.5329866409301758,
"rewards/rejected": -2.2440977096557617,
"step": 353
},
{
"epoch": 0.7567134268537075,
"grad_norm": 10.900263207759233,
"learning_rate": 1.6822338549489446e-07,
"logits/chosen": -0.6276527047157288,
"logits/rejected": -0.6185672879219055,
"logps/chosen": -353.99462890625,
"logps/rejected": -390.6813659667969,
"loss": 0.5823,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.318265438079834,
"rewards/margins": 0.3258642554283142,
"rewards/rejected": -1.6441295146942139,
"step": 354
},
{
"epoch": 0.7588510354041417,
"grad_norm": 11.630075473409493,
"learning_rate": 1.6543469682057104e-07,
"logits/chosen": -0.7092128992080688,
"logits/rejected": -0.6994844079017639,
"logps/chosen": -449.5421142578125,
"logps/rejected": -491.48406982421875,
"loss": 0.5833,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.727628231048584,
"rewards/margins": 0.27954497933387756,
"rewards/rejected": -2.0071730613708496,
"step": 355
},
{
"epoch": 0.7609886439545758,
"grad_norm": 11.035433473738337,
"learning_rate": 1.6266472703396284e-07,
"logits/chosen": -0.801999568939209,
"logits/rejected": -0.7807914614677429,
"logps/chosen": -436.8575439453125,
"logps/rejected": -460.9193115234375,
"loss": 0.5715,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.5792278051376343,
"rewards/margins": 0.3968886435031891,
"rewards/rejected": -1.976116418838501,
"step": 356
},
{
"epoch": 0.76312625250501,
"grad_norm": 11.722081233691071,
"learning_rate": 1.599136311145402e-07,
"logits/chosen": -0.6747885942459106,
"logits/rejected": -0.618495523929596,
"logps/chosen": -422.78729248046875,
"logps/rejected": -472.6419372558594,
"loss": 0.6287,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.606202244758606,
"rewards/margins": 0.4832748472690582,
"rewards/rejected": -2.089477062225342,
"step": 357
},
{
"epoch": 0.7652638610554442,
"grad_norm": 12.64820349502746,
"learning_rate": 1.5718156298578288e-07,
"logits/chosen": -0.7273571491241455,
"logits/rejected": -0.6881564855575562,
"logps/chosen": -425.4215087890625,
"logps/rejected": -444.314208984375,
"loss": 0.628,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.6894898414611816,
"rewards/margins": 0.1857471764087677,
"rewards/rejected": -1.875237226486206,
"step": 358
},
{
"epoch": 0.7674014696058784,
"grad_norm": 10.96328639456088,
"learning_rate": 1.5446867550656767e-07,
"logits/chosen": -0.6399669647216797,
"logits/rejected": -0.6358177661895752,
"logps/chosen": -372.887939453125,
"logps/rejected": -401.3673095703125,
"loss": 0.5816,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.543872356414795,
"rewards/margins": 0.2215932011604309,
"rewards/rejected": -1.7654657363891602,
"step": 359
},
{
"epoch": 0.7695390781563126,
"grad_norm": 11.515812302503546,
"learning_rate": 1.5177512046261666e-07,
"logits/chosen": -0.7418455481529236,
"logits/rejected": -0.6992334127426147,
"logps/chosen": -440.9914245605469,
"logps/rejected": -484.72882080078125,
"loss": 0.5918,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.6546094417572021,
"rewards/margins": 0.40860795974731445,
"rewards/rejected": -2.0632174015045166,
"step": 360
},
{
"epoch": 0.7716766867067468,
"grad_norm": 11.301203400803265,
"learning_rate": 1.4910104855800426e-07,
"logits/chosen": -0.5830298066139221,
"logits/rejected": -0.541452944278717,
"logps/chosen": -428.9151611328125,
"logps/rejected": -450.665283203125,
"loss": 0.6244,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6960718631744385,
"rewards/margins": 0.26461082696914673,
"rewards/rejected": -1.96068274974823,
"step": 361
},
{
"epoch": 0.773814295257181,
"grad_norm": 10.448871439187576,
"learning_rate": 1.4644660940672627e-07,
"logits/chosen": -0.643266499042511,
"logits/rejected": -0.6516848802566528,
"logps/chosen": -382.15582275390625,
"logps/rejected": -422.58001708984375,
"loss": 0.6302,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.693763017654419,
"rewards/margins": 0.31615814566612244,
"rewards/rejected": -2.0099213123321533,
"step": 362
},
{
"epoch": 0.7759519038076153,
"grad_norm": 12.065289362336179,
"learning_rate": 1.4381195152432769e-07,
"logits/chosen": -0.7809977531433105,
"logits/rejected": -0.7569341659545898,
"logps/chosen": -402.4347229003906,
"logps/rejected": -426.5815124511719,
"loss": 0.6014,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4668489694595337,
"rewards/margins": 0.27162590622901917,
"rewards/rejected": -1.7384748458862305,
"step": 363
},
{
"epoch": 0.7780895123580495,
"grad_norm": 11.450406850059426,
"learning_rate": 1.4119722231959403e-07,
"logits/chosen": -0.7261683940887451,
"logits/rejected": -0.7380213737487793,
"logps/chosen": -320.5738830566406,
"logps/rejected": -376.2132568359375,
"loss": 0.6148,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1737643480300903,
"rewards/margins": 0.4349837005138397,
"rewards/rejected": -1.608747959136963,
"step": 364
},
{
"epoch": 0.7802271209084837,
"grad_norm": 10.17075140486933,
"learning_rate": 1.3860256808630427e-07,
"logits/chosen": -0.6793671250343323,
"logits/rejected": -0.6769421100616455,
"logps/chosen": -396.4522705078125,
"logps/rejected": -426.5015869140625,
"loss": 0.5761,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5801575183868408,
"rewards/margins": 0.23419132828712463,
"rewards/rejected": -1.8143486976623535,
"step": 365
},
{
"epoch": 0.7823647294589179,
"grad_norm": 9.496259172803326,
"learning_rate": 1.3602813399504458e-07,
"logits/chosen": -0.7178226113319397,
"logits/rejected": -0.7088046073913574,
"logps/chosen": -362.4988098144531,
"logps/rejected": -413.68255615234375,
"loss": 0.5697,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4096518754959106,
"rewards/margins": 0.4560312330722809,
"rewards/rejected": -1.8656830787658691,
"step": 366
},
{
"epoch": 0.784502338009352,
"grad_norm": 11.467183173889156,
"learning_rate": 1.3347406408508694e-07,
"logits/chosen": -0.58012455701828,
"logits/rejected": -0.6086165308952332,
"logps/chosen": -381.5002746582031,
"logps/rejected": -446.1846618652344,
"loss": 0.5768,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.546051263809204,
"rewards/margins": 0.5468287467956543,
"rewards/rejected": -2.0928800106048584,
"step": 367
},
{
"epoch": 0.7866399465597862,
"grad_norm": 11.770361743077546,
"learning_rate": 1.3094050125632972e-07,
"logits/chosen": -0.665503978729248,
"logits/rejected": -0.6807020902633667,
"logps/chosen": -339.297119140625,
"logps/rejected": -378.72283935546875,
"loss": 0.6007,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.239609718322754,
"rewards/margins": 0.311847984790802,
"rewards/rejected": -1.5514576435089111,
"step": 368
},
{
"epoch": 0.7887775551102204,
"grad_norm": 11.239777792633861,
"learning_rate": 1.284275872613028e-07,
"logits/chosen": -0.7516641020774841,
"logits/rejected": -0.7523844242095947,
"logps/chosen": -465.70562744140625,
"logps/rejected": -494.3858642578125,
"loss": 0.5955,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.708259105682373,
"rewards/margins": 0.19142737984657288,
"rewards/rejected": -1.899686336517334,
"step": 369
},
{
"epoch": 0.7909151636606546,
"grad_norm": 12.24207530779827,
"learning_rate": 1.2593546269723647e-07,
"logits/chosen": -0.7178503274917603,
"logits/rejected": -0.7465229630470276,
"logps/chosen": -350.14300537109375,
"logps/rejected": -426.7923583984375,
"loss": 0.5556,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.3200982809066772,
"rewards/margins": 0.5981042981147766,
"rewards/rejected": -1.918202519416809,
"step": 370
},
{
"epoch": 0.7930527722110888,
"grad_norm": 11.476654461821495,
"learning_rate": 1.2346426699819456e-07,
"logits/chosen": -0.6654431223869324,
"logits/rejected": -0.6413010954856873,
"logps/chosen": -432.3926086425781,
"logps/rejected": -445.0782165527344,
"loss": 0.6153,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.8410680294036865,
"rewards/margins": 0.26301610469818115,
"rewards/rejected": -2.104084014892578,
"step": 371
},
{
"epoch": 0.795190380761523,
"grad_norm": 11.015669950808952,
"learning_rate": 1.2101413842727343e-07,
"logits/chosen": -0.748419463634491,
"logits/rejected": -0.7465101480484009,
"logps/chosen": -404.2447204589844,
"logps/rejected": -458.7890625,
"loss": 0.6227,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4635306596755981,
"rewards/margins": 0.4813007712364197,
"rewards/rejected": -1.9448314905166626,
"step": 372
},
{
"epoch": 0.7973279893119573,
"grad_norm": 11.968874819239444,
"learning_rate": 1.1858521406886674e-07,
"logits/chosen": -0.6935529112815857,
"logits/rejected": -0.6768806576728821,
"logps/chosen": -479.6001892089844,
"logps/rejected": -526.9801025390625,
"loss": 0.5949,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.146116018295288,
"rewards/margins": 0.5019779205322266,
"rewards/rejected": -2.6480939388275146,
"step": 373
},
{
"epoch": 0.7994655978623915,
"grad_norm": 11.31673592574301,
"learning_rate": 1.1617762982099444e-07,
"logits/chosen": -0.7199594974517822,
"logits/rejected": -0.7195298671722412,
"logps/chosen": -390.56695556640625,
"logps/rejected": -437.9982604980469,
"loss": 0.6259,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.6399167776107788,
"rewards/margins": 0.4196929931640625,
"rewards/rejected": -2.0596096515655518,
"step": 374
},
{
"epoch": 0.8016032064128257,
"grad_norm": 11.65245860510705,
"learning_rate": 1.1379152038770029e-07,
"logits/chosen": -0.6417936086654663,
"logits/rejected": -0.5881288051605225,
"logps/chosen": -462.2901611328125,
"logps/rejected": -533.3080444335938,
"loss": 0.639,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.8019856214523315,
"rewards/margins": 0.778126060962677,
"rewards/rejected": -2.580111503601074,
"step": 375
},
{
"epoch": 0.8037408149632599,
"grad_norm": 11.237123066893254,
"learning_rate": 1.1142701927151454e-07,
"logits/chosen": -0.742131233215332,
"logits/rejected": -0.7236477136611938,
"logps/chosen": -440.7339782714844,
"logps/rejected": -468.85723876953125,
"loss": 0.622,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.7191107273101807,
"rewards/margins": 0.3171979784965515,
"rewards/rejected": -2.036308765411377,
"step": 376
},
{
"epoch": 0.8058784235136941,
"grad_norm": 12.182574925046193,
"learning_rate": 1.090842587659851e-07,
"logits/chosen": -0.6230757832527161,
"logits/rejected": -0.6275469064712524,
"logps/chosen": -345.8181457519531,
"logps/rejected": -382.3629150390625,
"loss": 0.6094,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.315040111541748,
"rewards/margins": 0.3147667646408081,
"rewards/rejected": -1.6298067569732666,
"step": 377
},
{
"epoch": 0.8080160320641283,
"grad_norm": 12.095497229280761,
"learning_rate": 1.0676336994827512e-07,
"logits/chosen": -0.8505545258522034,
"logits/rejected": -0.8231047987937927,
"logps/chosen": -439.4098205566406,
"logps/rejected": -450.57861328125,
"loss": 0.5882,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.8718527555465698,
"rewards/margins": 0.0610598549246788,
"rewards/rejected": -1.9329125881195068,
"step": 378
},
{
"epoch": 0.8101536406145624,
"grad_norm": 12.306182802408912,
"learning_rate": 1.044644826718295e-07,
"logits/chosen": -0.6553314924240112,
"logits/rejected": -0.6298251152038574,
"logps/chosen": -428.9188537597656,
"logps/rejected": -464.73126220703125,
"loss": 0.5659,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.740647554397583,
"rewards/margins": 0.34435731172561646,
"rewards/rejected": -2.0850048065185547,
"step": 379
},
{
"epoch": 0.8122912491649966,
"grad_norm": 11.84427292451934,
"learning_rate": 1.0218772555910954e-07,
"logits/chosen": -0.6922661662101746,
"logits/rejected": -0.7002755999565125,
"logps/chosen": -382.30987548828125,
"logps/rejected": -423.1776123046875,
"loss": 0.6365,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.37299644947052,
"rewards/margins": 0.28076884150505066,
"rewards/rejected": -1.6537654399871826,
"step": 380
},
{
"epoch": 0.8144288577154308,
"grad_norm": 11.204366574978794,
"learning_rate": 9.99332259943969e-08,
"logits/chosen": -0.7378983497619629,
"logits/rejected": -0.7215259075164795,
"logps/chosen": -465.00885009765625,
"logps/rejected": -522.8477783203125,
"loss": 0.6099,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6207081079483032,
"rewards/margins": 0.5316731333732605,
"rewards/rejected": -2.152381420135498,
"step": 381
},
{
"epoch": 0.816566466265865,
"grad_norm": 10.577264704091782,
"learning_rate": 9.770111011666582e-08,
"logits/chosen": -0.7259981632232666,
"logits/rejected": -0.7045480012893677,
"logps/chosen": -428.8095703125,
"logps/rejected": -492.239013671875,
"loss": 0.6087,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.5366865396499634,
"rewards/margins": 0.7509114146232605,
"rewards/rejected": -2.287597894668579,
"step": 382
},
{
"epoch": 0.8187040748162993,
"grad_norm": 12.483677889539976,
"learning_rate": 9.549150281252632e-08,
"logits/chosen": -0.6887928247451782,
"logits/rejected": -0.6907156705856323,
"logps/chosen": -352.9273681640625,
"logps/rejected": -383.6487121582031,
"loss": 0.6259,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.5845268964767456,
"rewards/margins": 0.28942757844924927,
"rewards/rejected": -1.8739545345306396,
"step": 383
},
{
"epoch": 0.8208416833667335,
"grad_norm": 19.23190107186564,
"learning_rate": 9.330452770923603e-08,
"logits/chosen": -0.762394905090332,
"logits/rejected": -0.7647604942321777,
"logps/chosen": -451.6494140625,
"logps/rejected": -534.89892578125,
"loss": 0.5934,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.8478323221206665,
"rewards/margins": 0.6826062798500061,
"rewards/rejected": -2.5304384231567383,
"step": 384
},
{
"epoch": 0.8229792919171677,
"grad_norm": 12.247364252908152,
"learning_rate": 9.114030716778432e-08,
"logits/chosen": -0.7505077123641968,
"logits/rejected": -0.7758923768997192,
"logps/chosen": -470.6575927734375,
"logps/rejected": -503.64556884765625,
"loss": 0.6397,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.8584275245666504,
"rewards/margins": 0.3208252787590027,
"rewards/rejected": -2.179252862930298,
"step": 385
},
{
"epoch": 0.8251169004676019,
"grad_norm": 11.383815632835855,
"learning_rate": 8.899896227604508e-08,
"logits/chosen": -0.6819490194320679,
"logits/rejected": -0.6731836199760437,
"logps/chosen": -433.1306457519531,
"logps/rejected": -487.12646484375,
"loss": 0.6317,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.6253031492233276,
"rewards/margins": 0.4985421299934387,
"rewards/rejected": -2.123845100402832,
"step": 386
},
{
"epoch": 0.8272545090180361,
"grad_norm": 13.189195026919496,
"learning_rate": 8.688061284200265e-08,
"logits/chosen": -0.6536362171173096,
"logits/rejected": -0.6316641569137573,
"logps/chosen": -447.10577392578125,
"logps/rejected": -500.36700439453125,
"loss": 0.6544,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.6854324340820312,
"rewards/margins": 0.3739916980266571,
"rewards/rejected": -2.0594239234924316,
"step": 387
},
{
"epoch": 0.8293921175684703,
"grad_norm": 12.05654473393893,
"learning_rate": 8.478537738704811e-08,
"logits/chosen": -0.7113953232765198,
"logits/rejected": -0.6980003118515015,
"logps/chosen": -437.1040344238281,
"logps/rejected": -477.0093078613281,
"loss": 0.5797,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7935041189193726,
"rewards/margins": 0.35914352536201477,
"rewards/rejected": -2.1526474952697754,
"step": 388
},
{
"epoch": 0.8315297261189045,
"grad_norm": 11.698252029580289,
"learning_rate": 8.271337313934867e-08,
"logits/chosen": -0.624556839466095,
"logits/rejected": -0.6502059698104858,
"logps/chosen": -414.85882568359375,
"logps/rejected": -456.1212158203125,
"loss": 0.6072,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.6804428100585938,
"rewards/margins": 0.3041376769542694,
"rewards/rejected": -1.9845805168151855,
"step": 389
},
{
"epoch": 0.8336673346693386,
"grad_norm": 12.14771475451631,
"learning_rate": 8.066471602728803e-08,
"logits/chosen": -0.6798664331436157,
"logits/rejected": -0.6738008260726929,
"logps/chosen": -411.55047607421875,
"logps/rejected": -457.5155334472656,
"loss": 0.5922,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4717459678649902,
"rewards/margins": 0.5215471386909485,
"rewards/rejected": -1.993293046951294,
"step": 390
},
{
"epoch": 0.8358049432197728,
"grad_norm": 12.796578845217505,
"learning_rate": 7.863952067298041e-08,
"logits/chosen": -0.5822688937187195,
"logits/rejected": -0.564083993434906,
"logps/chosen": -431.5522155761719,
"logps/rejected": -450.26739501953125,
"loss": 0.628,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.9067468643188477,
"rewards/margins": 0.21568900346755981,
"rewards/rejected": -2.1224358081817627,
"step": 391
},
{
"epoch": 0.837942551770207,
"grad_norm": 11.159935748642301,
"learning_rate": 7.663790038585794e-08,
"logits/chosen": -0.662575364112854,
"logits/rejected": -0.6590286493301392,
"logps/chosen": -444.98162841796875,
"logps/rejected": -497.9795227050781,
"loss": 0.5731,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.7177234888076782,
"rewards/margins": 0.6272789239883423,
"rewards/rejected": -2.3450024127960205,
"step": 392
},
{
"epoch": 0.8400801603206413,
"grad_norm": 14.31633694385083,
"learning_rate": 7.465996715633027e-08,
"logits/chosen": -0.6459007263183594,
"logits/rejected": -0.6448737382888794,
"logps/chosen": -397.7703552246094,
"logps/rejected": -440.38238525390625,
"loss": 0.5945,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7266194820404053,
"rewards/margins": 0.44395869970321655,
"rewards/rejected": -2.1705780029296875,
"step": 393
},
{
"epoch": 0.8422177688710755,
"grad_norm": 11.224103010133572,
"learning_rate": 7.270583164951926e-08,
"logits/chosen": -0.6865531206130981,
"logits/rejected": -0.6968246698379517,
"logps/chosen": -354.6371154785156,
"logps/rejected": -424.811279296875,
"loss": 0.6334,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.536527156829834,
"rewards/margins": 0.5009466409683228,
"rewards/rejected": -2.0374739170074463,
"step": 394
},
{
"epoch": 0.8443553774215097,
"grad_norm": 10.826285808287984,
"learning_rate": 7.077560319906694e-08,
"logits/chosen": -0.6569056510925293,
"logits/rejected": -0.6044581532478333,
"logps/chosen": -360.92681884765625,
"logps/rejected": -372.06512451171875,
"loss": 0.6061,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.6241035461425781,
"rewards/margins": 0.13333997130393982,
"rewards/rejected": -1.7574434280395508,
"step": 395
},
{
"epoch": 0.8464929859719439,
"grad_norm": 11.121731952204106,
"learning_rate": 6.886938980101869e-08,
"logits/chosen": -0.6959440112113953,
"logits/rejected": -0.6976322531700134,
"logps/chosen": -481.72747802734375,
"logps/rejected": -528.6837768554688,
"loss": 0.5492,
"rewards/accuracies": 0.90625,
"rewards/chosen": -1.7375783920288086,
"rewards/margins": 0.5362969040870667,
"rewards/rejected": -2.2738752365112305,
"step": 396
},
{
"epoch": 0.8486305945223781,
"grad_norm": 11.676543714442664,
"learning_rate": 6.698729810778064e-08,
"logits/chosen": -0.7131574153900146,
"logits/rejected": -0.6955525875091553,
"logps/chosen": -399.06610107421875,
"logps/rejected": -414.4498291015625,
"loss": 0.5949,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6074295043945312,
"rewards/margins": 0.191550150513649,
"rewards/rejected": -1.798979640007019,
"step": 397
},
{
"epoch": 0.8507682030728123,
"grad_norm": 11.148999020679877,
"learning_rate": 6.512943342215232e-08,
"logits/chosen": -0.7562680244445801,
"logits/rejected": -0.779510498046875,
"logps/chosen": -484.00506591796875,
"logps/rejected": -511.72882080078125,
"loss": 0.5846,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.8436009883880615,
"rewards/margins": 0.35308870673179626,
"rewards/rejected": -2.1966898441314697,
"step": 398
},
{
"epoch": 0.8529058116232465,
"grad_norm": 11.997120047866387,
"learning_rate": 6.329589969143517e-08,
"logits/chosen": -0.6792132258415222,
"logits/rejected": -0.6694210171699524,
"logps/chosen": -424.2513427734375,
"logps/rejected": -454.1995849609375,
"loss": 0.5749,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.7970068454742432,
"rewards/margins": 0.36387020349502563,
"rewards/rejected": -2.160876750946045,
"step": 399
},
{
"epoch": 0.8550434201736807,
"grad_norm": 11.238178437853232,
"learning_rate": 6.148679950161672e-08,
"logits/chosen": -0.6610137820243835,
"logits/rejected": -0.6665123105049133,
"logps/chosen": -446.12451171875,
"logps/rejected": -491.7059326171875,
"loss": 0.5888,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.106739044189453,
"rewards/margins": 0.3179362714290619,
"rewards/rejected": -2.4246749877929688,
"step": 400
},
{
"epoch": 0.8550434201736807,
"eval_logits/chosen": -0.6434622406959534,
"eval_logits/rejected": -0.6439588069915771,
"eval_logps/chosen": -418.7559814453125,
"eval_logps/rejected": -459.1051330566406,
"eval_loss": 0.6088488698005676,
"eval_rewards/accuracies": 0.6829268336296082,
"eval_rewards/chosen": -1.6320796012878418,
"eval_rewards/margins": 0.34643232822418213,
"eval_rewards/rejected": -1.9785118103027344,
"eval_runtime": 373.8135,
"eval_samples_per_second": 5.246,
"eval_steps_per_second": 0.329,
"step": 400
},
{
"epoch": 0.857181028724115,
"grad_norm": 13.938068351492317,
"learning_rate": 5.9702234071631e-08,
"logits/chosen": -0.6074206233024597,
"logits/rejected": -0.57494056224823,
"logps/chosen": -432.34869384765625,
"logps/rejected": -481.2635498046875,
"loss": 0.624,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.724292278289795,
"rewards/margins": 0.546868622303009,
"rewards/rejected": -2.2711610794067383,
"step": 401
},
{
"epoch": 0.859318637274549,
"grad_norm": 14.889197610496133,
"learning_rate": 5.794230324769517e-08,
"logits/chosen": -0.6924403309822083,
"logits/rejected": -0.695598304271698,
"logps/chosen": -430.3138732910156,
"logps/rejected": -472.5992431640625,
"loss": 0.6165,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.809744119644165,
"rewards/margins": 0.3945625424385071,
"rewards/rejected": -2.2043066024780273,
"step": 402
},
{
"epoch": 0.8614562458249833,
"grad_norm": 10.24820423132373,
"learning_rate": 5.620710549772295e-08,
"logits/chosen": -0.6588191390037537,
"logits/rejected": -0.6449538469314575,
"logps/chosen": -391.6925354003906,
"logps/rejected": -442.3234558105469,
"loss": 0.6086,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5062448978424072,
"rewards/margins": 0.3859245777130127,
"rewards/rejected": -1.89216947555542,
"step": 403
},
{
"epoch": 0.8635938543754175,
"grad_norm": 11.857754301901029,
"learning_rate": 5.44967379058161e-08,
"logits/chosen": -0.7503631114959717,
"logits/rejected": -0.7300340533256531,
"logps/chosen": -386.56072998046875,
"logps/rejected": -396.8390808105469,
"loss": 0.5982,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.6606768369674683,
"rewards/margins": 0.09765629470348358,
"rewards/rejected": -1.7583332061767578,
"step": 404
},
{
"epoch": 0.8657314629258517,
"grad_norm": 10.569416708562631,
"learning_rate": 5.2811296166831666e-08,
"logits/chosen": -0.7464025020599365,
"logits/rejected": -0.7271702885627747,
"logps/chosen": -413.1631774902344,
"logps/rejected": -470.1753234863281,
"loss": 0.581,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.5785975456237793,
"rewards/margins": 0.4657081961631775,
"rewards/rejected": -2.0443055629730225,
"step": 405
},
{
"epoch": 0.8678690714762859,
"grad_norm": 11.592126458293672,
"learning_rate": 5.11508745810284e-08,
"logits/chosen": -0.667618453502655,
"logits/rejected": -0.6744921207427979,
"logps/chosen": -404.13824462890625,
"logps/rejected": -412.8828430175781,
"loss": 0.6282,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.6986844539642334,
"rewards/margins": 0.08317200094461441,
"rewards/rejected": -1.7818565368652344,
"step": 406
},
{
"epoch": 0.8700066800267201,
"grad_norm": 12.223879763686206,
"learning_rate": 4.951556604879048e-08,
"logits/chosen": -0.6467772126197815,
"logits/rejected": -0.6247937679290771,
"logps/chosen": -442.8312683105469,
"logps/rejected": -498.9490051269531,
"loss": 0.6112,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.812768816947937,
"rewards/margins": 0.4102476239204407,
"rewards/rejected": -2.2230165004730225,
"step": 407
},
{
"epoch": 0.8721442885771543,
"grad_norm": 13.609539677706314,
"learning_rate": 4.7905462065429946e-08,
"logits/chosen": -0.838919997215271,
"logits/rejected": -0.8245532512664795,
"logps/chosen": -415.890869140625,
"logps/rejected": -435.3166198730469,
"loss": 0.6788,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.5928852558135986,
"rewards/margins": 0.25430333614349365,
"rewards/rejected": -1.8471887111663818,
"step": 408
},
{
"epoch": 0.8742818971275885,
"grad_norm": 10.454604993981434,
"learning_rate": 4.6320652716067555e-08,
"logits/chosen": -0.7226736545562744,
"logits/rejected": -0.7249311208724976,
"logps/chosen": -406.7791748046875,
"logps/rejected": -448.5416259765625,
"loss": 0.609,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.5601541996002197,
"rewards/margins": 0.37674978375434875,
"rewards/rejected": -1.936903953552246,
"step": 409
},
{
"epoch": 0.8764195056780227,
"grad_norm": 10.922415025272509,
"learning_rate": 4.4761226670592066e-08,
"logits/chosen": -0.7072638869285583,
"logits/rejected": -0.6469031572341919,
"logps/chosen": -437.16253662109375,
"logps/rejected": -464.5462951660156,
"loss": 0.6033,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.673638105392456,
"rewards/margins": 0.27518972754478455,
"rewards/rejected": -1.948827862739563,
"step": 410
},
{
"epoch": 0.878557114228457,
"grad_norm": 11.15308213585594,
"learning_rate": 4.322727117869951e-08,
"logits/chosen": -0.5786024332046509,
"logits/rejected": -0.5698223114013672,
"logps/chosen": -387.2678527832031,
"logps/rejected": -420.85101318359375,
"loss": 0.6038,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.5769679546356201,
"rewards/margins": 0.32589417695999146,
"rewards/rejected": -1.9028621912002563,
"step": 411
},
{
"epoch": 0.8806947227788912,
"grad_norm": 14.488282375387797,
"learning_rate": 4.17188720650119e-08,
"logits/chosen": -0.7604373097419739,
"logits/rejected": -0.7526075839996338,
"logps/chosen": -510.45159912109375,
"logps/rejected": -509.4095458984375,
"loss": 0.6893,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.090540885925293,
"rewards/margins": 0.033110879361629486,
"rewards/rejected": -2.1236515045166016,
"step": 412
},
{
"epoch": 0.8828323313293254,
"grad_norm": 11.024440296301012,
"learning_rate": 4.023611372427471e-08,
"logits/chosen": -0.7349828481674194,
"logits/rejected": -0.7459964156150818,
"logps/chosen": -388.6877746582031,
"logps/rejected": -420.792236328125,
"loss": 0.5967,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.7269947528839111,
"rewards/margins": 0.28061679005622864,
"rewards/rejected": -2.0076115131378174,
"step": 413
},
{
"epoch": 0.8849699398797595,
"grad_norm": 11.21797390523024,
"learning_rate": 3.877907911663542e-08,
"logits/chosen": -0.6687692403793335,
"logits/rejected": -0.6710121631622314,
"logps/chosen": -361.7718200683594,
"logps/rejected": -406.8067321777344,
"loss": 0.5766,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2828166484832764,
"rewards/margins": 0.4076838493347168,
"rewards/rejected": -1.6905003786087036,
"step": 414
},
{
"epoch": 0.8871075484301937,
"grad_norm": 11.310104114342186,
"learning_rate": 3.734784976300165e-08,
"logits/chosen": -0.7112718820571899,
"logits/rejected": -0.6793174743652344,
"logps/chosen": -395.9449768066406,
"logps/rejected": -415.42108154296875,
"loss": 0.6427,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.5930345058441162,
"rewards/margins": 0.189566969871521,
"rewards/rejected": -1.7826014757156372,
"step": 415
},
{
"epoch": 0.8892451569806279,
"grad_norm": 11.184536783628738,
"learning_rate": 3.594250574048058e-08,
"logits/chosen": -0.6613335609436035,
"logits/rejected": -0.6428050994873047,
"logps/chosen": -367.02874755859375,
"logps/rejected": -389.9844970703125,
"loss": 0.6174,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.537019968032837,
"rewards/margins": 0.1499500423669815,
"rewards/rejected": -1.6869698762893677,
"step": 416
},
{
"epoch": 0.8913827655310621,
"grad_norm": 12.205433845637979,
"learning_rate": 3.456312567789793e-08,
"logits/chosen": -0.7070876955986023,
"logits/rejected": -0.7160503268241882,
"logps/chosen": -469.0753173828125,
"logps/rejected": -494.7930908203125,
"loss": 0.6228,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9284939765930176,
"rewards/margins": 0.23351570963859558,
"rewards/rejected": -2.1620097160339355,
"step": 417
},
{
"epoch": 0.8935203740814963,
"grad_norm": 11.816949999049964,
"learning_rate": 3.3209786751399184e-08,
"logits/chosen": -0.6653708815574646,
"logits/rejected": -0.6532600522041321,
"logps/chosen": -464.9654541015625,
"logps/rejected": -504.6747741699219,
"loss": 0.5449,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9524656534194946,
"rewards/margins": 0.3868550658226013,
"rewards/rejected": -2.339320659637451,
"step": 418
},
{
"epoch": 0.8956579826319305,
"grad_norm": 10.879487484866441,
"learning_rate": 3.188256468013139e-08,
"logits/chosen": -0.6497898101806641,
"logits/rejected": -0.6454100608825684,
"logps/chosen": -478.6482238769531,
"logps/rejected": -530.9612426757812,
"loss": 0.5632,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8596200942993164,
"rewards/margins": 0.5317977070808411,
"rewards/rejected": -2.3914177417755127,
"step": 419
},
{
"epoch": 0.8977955911823647,
"grad_norm": 11.880007198303698,
"learning_rate": 3.058153372200695e-08,
"logits/chosen": -0.6183308959007263,
"logits/rejected": -0.6003840565681458,
"logps/chosen": -459.9405212402344,
"logps/rejected": -505.4646911621094,
"loss": 0.6181,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.9765161275863647,
"rewards/margins": 0.4471958875656128,
"rewards/rejected": -2.4237117767333984,
"step": 420
},
{
"epoch": 0.899933199732799,
"grad_norm": 13.426952468351809,
"learning_rate": 2.9306766669548457e-08,
"logits/chosen": -0.7094901204109192,
"logits/rejected": -0.6653531193733215,
"logps/chosen": -466.24029541015625,
"logps/rejected": -487.9982604980469,
"loss": 0.5993,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.9488886594772339,
"rewards/margins": 0.351326048374176,
"rewards/rejected": -2.3002147674560547,
"step": 421
},
{
"epoch": 0.9020708082832332,
"grad_norm": 11.556511924801233,
"learning_rate": 2.805833484581621e-08,
"logits/chosen": -0.8073502779006958,
"logits/rejected": -0.7438942790031433,
"logps/chosen": -459.5665588378906,
"logps/rejected": -462.51519775390625,
"loss": 0.5975,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.806351900100708,
"rewards/margins": 0.16977502405643463,
"rewards/rejected": -1.976126790046692,
"step": 422
},
{
"epoch": 0.9042084168336674,
"grad_norm": 12.210595464753169,
"learning_rate": 2.6836308100417872e-08,
"logits/chosen": -0.6977376341819763,
"logits/rejected": -0.6720814108848572,
"logps/chosen": -427.6357727050781,
"logps/rejected": -460.1345520019531,
"loss": 0.5831,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.711733102798462,
"rewards/margins": 0.44918665289878845,
"rewards/rejected": -2.1609199047088623,
"step": 423
},
{
"epoch": 0.9063460253841016,
"grad_norm": 9.940416325858004,
"learning_rate": 2.5640754805600128e-08,
"logits/chosen": -0.7047473788261414,
"logits/rejected": -0.7050879597663879,
"logps/chosen": -355.5130615234375,
"logps/rejected": -383.0091552734375,
"loss": 0.6143,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.397302508354187,
"rewards/margins": 0.2378145009279251,
"rewards/rejected": -1.6351170539855957,
"step": 424
},
{
"epoch": 0.9084836339345357,
"grad_norm": 12.20154656454581,
"learning_rate": 2.4471741852423233e-08,
"logits/chosen": -0.7828183174133301,
"logits/rejected": -0.7872889041900635,
"logps/chosen": -358.20751953125,
"logps/rejected": -394.5291748046875,
"loss": 0.5956,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3747402429580688,
"rewards/margins": 0.26737523078918457,
"rewards/rejected": -1.6421154737472534,
"step": 425
},
{
"epoch": 0.9106212424849699,
"grad_norm": 10.941418508752523,
"learning_rate": 2.3329334647018694e-08,
"logits/chosen": -0.6170888543128967,
"logits/rejected": -0.5692444443702698,
"logps/chosen": -472.42864990234375,
"logps/rejected": -516.1029052734375,
"loss": 0.5838,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.0783095359802246,
"rewards/margins": 0.4643981158733368,
"rewards/rejected": -2.542707681655884,
"step": 426
},
{
"epoch": 0.9127588510354041,
"grad_norm": 11.303024362125633,
"learning_rate": 2.2213597106929605e-08,
"logits/chosen": -0.5531542301177979,
"logits/rejected": -0.5305842161178589,
"logps/chosen": -422.59100341796875,
"logps/rejected": -460.8222961425781,
"loss": 0.6129,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.7473095655441284,
"rewards/margins": 0.3811667263507843,
"rewards/rejected": -2.12847638130188,
"step": 427
},
{
"epoch": 0.9148964595858383,
"grad_norm": 14.94800877117672,
"learning_rate": 2.1124591657534774e-08,
"logits/chosen": -0.6627920866012573,
"logits/rejected": -0.6768360733985901,
"logps/chosen": -437.7267150878906,
"logps/rejected": -494.59075927734375,
"loss": 0.6108,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.8751461505889893,
"rewards/margins": 0.4180339574813843,
"rewards/rejected": -2.293180227279663,
"step": 428
},
{
"epoch": 0.9170340681362725,
"grad_norm": 11.706241248792244,
"learning_rate": 2.0062379228555525e-08,
"logits/chosen": -0.6479263305664062,
"logits/rejected": -0.6203778386116028,
"logps/chosen": -371.62310791015625,
"logps/rejected": -380.03436279296875,
"loss": 0.6172,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4537248611450195,
"rewards/margins": 0.1625545471906662,
"rewards/rejected": -1.6162794828414917,
"step": 429
},
{
"epoch": 0.9191716766867067,
"grad_norm": 11.343840156206866,
"learning_rate": 1.9027019250647036e-08,
"logits/chosen": -0.7142120003700256,
"logits/rejected": -0.7347142696380615,
"logps/chosen": -411.76312255859375,
"logps/rejected": -476.20697021484375,
"loss": 0.5984,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.6978678703308105,
"rewards/margins": 0.46169888973236084,
"rewards/rejected": -2.159566640853882,
"step": 430
},
{
"epoch": 0.921309285237141,
"grad_norm": 10.191297953603781,
"learning_rate": 1.8018569652073378e-08,
"logits/chosen": -0.5895026922225952,
"logits/rejected": -0.5850787162780762,
"logps/chosen": -406.5594482421875,
"logps/rejected": -485.04217529296875,
"loss": 0.593,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.616127371788025,
"rewards/margins": 0.5671988129615784,
"rewards/rejected": -2.183326244354248,
"step": 431
},
{
"epoch": 0.9234468937875752,
"grad_norm": 12.93643748441664,
"learning_rate": 1.7037086855465898e-08,
"logits/chosen": -0.7007228136062622,
"logits/rejected": -0.6858587265014648,
"logps/chosen": -412.77496337890625,
"logps/rejected": -458.0037841796875,
"loss": 0.6264,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7415974140167236,
"rewards/margins": 0.3250362277030945,
"rewards/rejected": -2.066633701324463,
"step": 432
},
{
"epoch": 0.9255845023380094,
"grad_norm": 12.402383758119484,
"learning_rate": 1.6082625774666792e-08,
"logits/chosen": -0.6870225667953491,
"logits/rejected": -0.6988283395767212,
"logps/chosen": -401.1902770996094,
"logps/rejected": -415.27093505859375,
"loss": 0.5975,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.706886649131775,
"rewards/margins": 0.06286803632974625,
"rewards/rejected": -1.7697547674179077,
"step": 433
},
{
"epoch": 0.9277221108884436,
"grad_norm": 11.598555340742855,
"learning_rate": 1.5155239811656562e-08,
"logits/chosen": -0.7391936182975769,
"logits/rejected": -0.7346464395523071,
"logps/chosen": -362.8863525390625,
"logps/rejected": -407.58392333984375,
"loss": 0.5696,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4570672512054443,
"rewards/margins": 0.35567349195480347,
"rewards/rejected": -1.8127408027648926,
"step": 434
},
{
"epoch": 0.9298597194388778,
"grad_norm": 12.334039197101353,
"learning_rate": 1.4254980853566246e-08,
"logits/chosen": -0.7256093621253967,
"logits/rejected": -0.7030697464942932,
"logps/chosen": -498.5211181640625,
"logps/rejected": -495.9085693359375,
"loss": 0.6381,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.0069806575775146,
"rewards/margins": 0.12008260935544968,
"rewards/rejected": -2.127063274383545,
"step": 435
},
{
"epoch": 0.931997327989312,
"grad_norm": 10.831899946588043,
"learning_rate": 1.3381899269774289e-08,
"logits/chosen": -0.7507193088531494,
"logits/rejected": -0.7519603967666626,
"logps/chosen": -359.82000732421875,
"logps/rejected": -395.96771240234375,
"loss": 0.5773,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.372948408126831,
"rewards/margins": 0.3809196352958679,
"rewards/rejected": -1.7538681030273438,
"step": 436
},
{
"epoch": 0.9341349365397461,
"grad_norm": 14.210280355388763,
"learning_rate": 1.253604390908819e-08,
"logits/chosen": -0.5923041701316833,
"logits/rejected": -0.6011568307876587,
"logps/chosen": -345.30633544921875,
"logps/rejected": -392.51751708984375,
"loss": 0.6674,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.585827350616455,
"rewards/margins": 0.3826131224632263,
"rewards/rejected": -1.968440294265747,
"step": 437
},
{
"epoch": 0.9362725450901803,
"grad_norm": 9.851284631791968,
"learning_rate": 1.1717462097011855e-08,
"logits/chosen": -0.6331924796104431,
"logits/rejected": -0.6489231586456299,
"logps/chosen": -429.216796875,
"logps/rejected": -477.13836669921875,
"loss": 0.5764,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.8334829807281494,
"rewards/margins": 0.3744773864746094,
"rewards/rejected": -2.207960605621338,
"step": 438
},
{
"epoch": 0.9384101536406145,
"grad_norm": 11.272588511911392,
"learning_rate": 1.0926199633097154e-08,
"logits/chosen": -0.5822413563728333,
"logits/rejected": -0.5487803220748901,
"logps/chosen": -428.51934814453125,
"logps/rejected": -487.8909606933594,
"loss": 0.6055,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.6083500385284424,
"rewards/margins": 0.4286819398403168,
"rewards/rejected": -2.037031888961792,
"step": 439
},
{
"epoch": 0.9405477621910487,
"grad_norm": 10.801295437898501,
"learning_rate": 1.016230078838226e-08,
"logits/chosen": -0.7505050897598267,
"logits/rejected": -0.7208874225616455,
"logps/chosen": -511.32110595703125,
"logps/rejected": -563.9736938476562,
"loss": 0.5874,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.1491966247558594,
"rewards/margins": 0.5160467624664307,
"rewards/rejected": -2.665243625640869,
"step": 440
},
{
"epoch": 0.942685370741483,
"grad_norm": 12.022377343895434,
"learning_rate": 9.425808302913728e-09,
"logits/chosen": -0.6826910972595215,
"logits/rejected": -0.7009281516075134,
"logps/chosen": -396.803466796875,
"logps/rejected": -475.8189697265625,
"loss": 0.5696,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.5288541316986084,
"rewards/margins": 0.5459466576576233,
"rewards/rejected": -2.074800968170166,
"step": 441
},
{
"epoch": 0.9448229792919172,
"grad_norm": 11.529829218714097,
"learning_rate": 8.716763383355862e-09,
"logits/chosen": -0.6541940569877625,
"logits/rejected": -0.6755858063697815,
"logps/chosen": -480.7030944824219,
"logps/rejected": -526.9130249023438,
"loss": 0.5949,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.1652467250823975,
"rewards/margins": 0.42291441559791565,
"rewards/rejected": -2.588160991668701,
"step": 442
},
{
"epoch": 0.9469605878423514,
"grad_norm": 11.296144808610602,
"learning_rate": 8.035205700685165e-09,
"logits/chosen": -0.5620754361152649,
"logits/rejected": -0.5832556486129761,
"logps/chosen": -406.50115966796875,
"logps/rejected": -483.1033935546875,
"loss": 0.5998,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7801560163497925,
"rewards/margins": 0.6258376240730286,
"rewards/rejected": -2.4059934616088867,
"step": 443
},
{
"epoch": 0.9490981963927856,
"grad_norm": 14.459436916194253,
"learning_rate": 7.381173387970397e-09,
"logits/chosen": -0.6875967979431152,
"logits/rejected": -0.7037211060523987,
"logps/chosen": -387.79193115234375,
"logps/rejected": -406.17730712890625,
"loss": 0.625,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.775752067565918,
"rewards/margins": 0.16077642142772675,
"rewards/rejected": -1.9365284442901611,
"step": 444
},
{
"epoch": 0.9512358049432198,
"grad_norm": 12.667943463380384,
"learning_rate": 6.754703038239329e-09,
"logits/chosen": -0.6868771314620972,
"logits/rejected": -0.6806486248970032,
"logps/chosen": -391.65704345703125,
"logps/rejected": -439.91534423828125,
"loss": 0.616,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5933175086975098,
"rewards/margins": 0.549602746963501,
"rewards/rejected": -2.14292049407959,
"step": 445
},
{
"epoch": 0.953373413493654,
"grad_norm": 13.999625605626278,
"learning_rate": 6.15582970243117e-09,
"logits/chosen": -0.679996132850647,
"logits/rejected": -0.6954419612884521,
"logps/chosen": -411.924072265625,
"logps/rejected": -465.689453125,
"loss": 0.5689,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.5961847305297852,
"rewards/margins": 0.44558507204055786,
"rewards/rejected": -2.0417697429656982,
"step": 446
},
{
"epoch": 0.9555110220440882,
"grad_norm": 10.363695148382087,
"learning_rate": 5.5845868874357385e-09,
"logits/chosen": -0.6567386388778687,
"logits/rejected": -0.6833846569061279,
"logps/chosen": -491.9205627441406,
"logps/rejected": -569.992431640625,
"loss": 0.5532,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7513771057128906,
"rewards/margins": 0.6100505590438843,
"rewards/rejected": -2.3614273071289062,
"step": 447
},
{
"epoch": 0.9576486305945224,
"grad_norm": 13.618449735155838,
"learning_rate": 5.0410065542185184e-09,
"logits/chosen": -0.5561550855636597,
"logits/rejected": -0.5477365851402283,
"logps/chosen": -404.7331848144531,
"logps/rejected": -456.8463134765625,
"loss": 0.5897,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.8092013597488403,
"rewards/margins": 0.39521071314811707,
"rewards/rejected": -2.2044119834899902,
"step": 448
},
{
"epoch": 0.9597862391449565,
"grad_norm": 11.691098922849898,
"learning_rate": 4.5251191160326495e-09,
"logits/chosen": -0.7571395039558411,
"logits/rejected": -0.6862713098526001,
"logps/chosen": -404.239501953125,
"logps/rejected": -430.745849609375,
"loss": 0.6223,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.5349277257919312,
"rewards/margins": 0.27732717990875244,
"rewards/rejected": -1.8122549057006836,
"step": 449
},
{
"epoch": 0.9619238476953907,
"grad_norm": 11.212537524360101,
"learning_rate": 4.036953436716895e-09,
"logits/chosen": -0.6666488647460938,
"logits/rejected": -0.6362468600273132,
"logps/chosen": -390.62713623046875,
"logps/rejected": -432.0528564453125,
"loss": 0.5807,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.528037428855896,
"rewards/margins": 0.3787933588027954,
"rewards/rejected": -1.9068307876586914,
"step": 450
},
{
"epoch": 0.964061456245825,
"grad_norm": 11.304696270661244,
"learning_rate": 3.5765368290813223e-09,
"logits/chosen": -0.6946466565132141,
"logits/rejected": -0.7254693508148193,
"logps/chosen": -417.11187744140625,
"logps/rejected": -472.14288330078125,
"loss": 0.6012,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.704296588897705,
"rewards/margins": 0.47184205055236816,
"rewards/rejected": -2.1761388778686523,
"step": 451
},
{
"epoch": 0.9661990647962592,
"grad_norm": 12.23326718620082,
"learning_rate": 3.1438950533786977e-09,
"logits/chosen": -0.727628767490387,
"logits/rejected": -0.7244228720664978,
"logps/chosen": -368.25653076171875,
"logps/rejected": -406.3876647949219,
"loss": 0.6045,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.741332769393921,
"rewards/margins": 0.22878167033195496,
"rewards/rejected": -1.9701144695281982,
"step": 452
},
{
"epoch": 0.9683366733466934,
"grad_norm": 12.889743033859292,
"learning_rate": 2.739052315863355e-09,
"logits/chosen": -0.7463970184326172,
"logits/rejected": -0.7229277491569519,
"logps/chosen": -395.8512878417969,
"logps/rejected": -452.2123718261719,
"loss": 0.5944,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4541352987289429,
"rewards/margins": 0.4854976534843445,
"rewards/rejected": -1.9396328926086426,
"step": 453
},
{
"epoch": 0.9704742818971276,
"grad_norm": 11.397457544749724,
"learning_rate": 2.3620312674367816e-09,
"logits/chosen": -0.7733277678489685,
"logits/rejected": -0.761780858039856,
"logps/chosen": -469.01544189453125,
"logps/rejected": -496.7162780761719,
"loss": 0.6331,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.8822633028030396,
"rewards/margins": 0.15627171099185944,
"rewards/rejected": -2.0385348796844482,
"step": 454
},
{
"epoch": 0.9726118904475618,
"grad_norm": 12.018897554978574,
"learning_rate": 2.0128530023804656e-09,
"logits/chosen": -0.6989326477050781,
"logits/rejected": -0.7310012578964233,
"logps/chosen": -407.86639404296875,
"logps/rejected": -467.86798095703125,
"loss": 0.5709,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.7583638429641724,
"rewards/margins": 0.5012065172195435,
"rewards/rejected": -2.2595701217651367,
"step": 455
},
{
"epoch": 0.974749498997996,
"grad_norm": 12.451774147613516,
"learning_rate": 1.6915370571756181e-09,
"logits/chosen": -0.7267682552337646,
"logits/rejected": -0.7152563333511353,
"logps/chosen": -450.92230224609375,
"logps/rejected": -483.6854248046875,
"loss": 0.6282,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.7722840309143066,
"rewards/margins": 0.15732887387275696,
"rewards/rejected": -1.9296131134033203,
"step": 456
},
{
"epoch": 0.9768871075484302,
"grad_norm": 12.126324151502713,
"learning_rate": 1.3981014094099353e-09,
"logits/chosen": -0.7544288635253906,
"logits/rejected": -0.7525961995124817,
"logps/chosen": -397.43109130859375,
"logps/rejected": -431.8571472167969,
"loss": 0.5846,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4742791652679443,
"rewards/margins": 0.3458634316921234,
"rewards/rejected": -1.8201426267623901,
"step": 457
},
{
"epoch": 0.9790247160988644,
"grad_norm": 9.907495266444734,
"learning_rate": 1.1325624767719588e-09,
"logits/chosen": -0.6586907505989075,
"logits/rejected": -0.6237790584564209,
"logps/chosen": -395.1099853515625,
"logps/rejected": -438.49237060546875,
"loss": 0.5992,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.5770848989486694,
"rewards/margins": 0.40755948424339294,
"rewards/rejected": -1.9846441745758057,
"step": 458
},
{
"epoch": 0.9811623246492986,
"grad_norm": 13.123065251970033,
"learning_rate": 8.949351161324225e-10,
"logits/chosen": -0.6515368223190308,
"logits/rejected": -0.6513477563858032,
"logps/chosen": -411.0286560058594,
"logps/rejected": -474.112548828125,
"loss": 0.621,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.7529451847076416,
"rewards/margins": 0.5362708568572998,
"rewards/rejected": -2.2892158031463623,
"step": 459
},
{
"epoch": 0.9832999331997327,
"grad_norm": 12.071426108415315,
"learning_rate": 6.852326227130833e-10,
"logits/chosen": -0.7456957697868347,
"logits/rejected": -0.6752879023551941,
"logps/chosen": -450.5098876953125,
"logps/rejected": -455.85577392578125,
"loss": 0.626,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.8361914157867432,
"rewards/margins": 0.20122388005256653,
"rewards/rejected": -2.037415027618408,
"step": 460
},
{
"epoch": 0.985437541750167,
"grad_norm": 11.748757454977515,
"learning_rate": 5.034667293427053e-10,
"logits/chosen": -0.7174670696258545,
"logits/rejected": -0.6987491250038147,
"logps/chosen": -434.594482421875,
"logps/rejected": -480.0499267578125,
"loss": 0.6146,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.7917670011520386,
"rewards/margins": 0.3851429224014282,
"rewards/rejected": -2.176909923553467,
"step": 461
},
{
"epoch": 0.9875751503006012,
"grad_norm": 20.824968125195323,
"learning_rate": 3.4964760580069585e-10,
"logits/chosen": -0.555869460105896,
"logits/rejected": -0.5152798891067505,
"logps/chosen": -407.359375,
"logps/rejected": -415.696533203125,
"loss": 0.6489,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.7928351163864136,
"rewards/margins": 0.13394100964069366,
"rewards/rejected": -1.9267761707305908,
"step": 462
},
{
"epoch": 0.9897127588510354,
"grad_norm": 11.441495646450653,
"learning_rate": 2.2378385824833866e-10,
"logits/chosen": -0.7355363965034485,
"logits/rejected": -0.727141261100769,
"logps/chosen": -411.1090087890625,
"logps/rejected": -475.9149475097656,
"loss": 0.6229,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.861409068107605,
"rewards/margins": 0.488926500082016,
"rewards/rejected": -2.3503353595733643,
"step": 463
},
{
"epoch": 0.9918503674014696,
"grad_norm": 11.714079308527252,
"learning_rate": 1.2588252874673466e-10,
"logits/chosen": -0.8587902784347534,
"logits/rejected": -0.8111391663551331,
"logps/chosen": -470.9283447265625,
"logps/rejected": -455.00372314453125,
"loss": 0.6381,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.885197639465332,
"rewards/margins": 0.07718580961227417,
"rewards/rejected": -1.9623833894729614,
"step": 464
},
{
"epoch": 0.9939879759519038,
"grad_norm": 16.820068325011835,
"learning_rate": 5.594909486328348e-11,
"logits/chosen": -0.5276237726211548,
"logits/rejected": -0.5462942123413086,
"logps/chosen": -459.17626953125,
"logps/rejected": -478.39056396484375,
"loss": 0.6987,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.8299469947814941,
"rewards/margins": 0.1971490979194641,
"rewards/rejected": -2.0270960330963135,
"step": 465
},
{
"epoch": 0.996125584502338,
"grad_norm": 10.981775368002003,
"learning_rate": 1.3987469365095429e-11,
"logits/chosen": -0.787868082523346,
"logits/rejected": -0.8146266937255859,
"logps/chosen": -463.4134826660156,
"logps/rejected": -492.61859130859375,
"loss": 0.5814,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.6699291467666626,
"rewards/margins": 0.16003668308258057,
"rewards/rejected": -1.8299658298492432,
"step": 466
},
{
"epoch": 0.9982631930527722,
"grad_norm": 10.869184778690476,
"learning_rate": 0.0,
"logits/chosen": -0.7284511923789978,
"logits/rejected": -0.7266198992729187,
"logps/chosen": -403.50836181640625,
"logps/rejected": -409.22369384765625,
"loss": 0.6449,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5633559226989746,
"rewards/margins": 0.2905767261981964,
"rewards/rejected": -1.8539327383041382,
"step": 467
},
{
"epoch": 0.9982631930527722,
"step": 467,
"total_flos": 0.0,
"train_loss": 0.6321173631915189,
"train_runtime": 21471.9268,
"train_samples_per_second": 2.789,
"train_steps_per_second": 0.022
}
],
"logging_steps": 1,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}