zephyr-7b-dpo-full / trainer_state.json
wzhouad's picture
Model save
3e15dd4 verified
raw
history blame
No virus
46 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9973828840617638,
"eval_steps": 10000,
"global_step": 954,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": 0.17704486846923828,
"logits/rejected": 0.25409135222435,
"logps/chosen": -354.4068603515625,
"logps/rejected": -305.2366638183594,
"loss": 0.1821,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -6.60312725813128e-05,
"rewards/margins": 0.00012125837383791804,
"rewards/rejected": -0.00018728969735093415,
"step": 10
},
{
"epoch": 0.04,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": 0.07091161608695984,
"logits/rejected": 0.1985362321138382,
"logps/chosen": -316.65069580078125,
"logps/rejected": -276.1200866699219,
"loss": 0.182,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.0008458361262455583,
"rewards/margins": 0.0016920112539082766,
"rewards/rejected": -0.0008461751276627183,
"step": 20
},
{
"epoch": 0.06,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": 0.17787829041481018,
"logits/rejected": 0.2488478720188141,
"logps/chosen": -294.9706115722656,
"logps/rejected": -298.59521484375,
"loss": 0.1822,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.700423985719681e-05,
"rewards/margins": 0.0029355171136558056,
"rewards/rejected": -0.0029725211206823587,
"step": 30
},
{
"epoch": 0.08,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": 0.09609868377447128,
"logits/rejected": 0.21795693039894104,
"logps/chosen": -347.44097900390625,
"logps/rejected": -320.9972839355469,
"loss": 0.1877,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.0013125470140948892,
"rewards/margins": 0.00661453977227211,
"rewards/rejected": -0.005301993805915117,
"step": 40
},
{
"epoch": 0.1,
"learning_rate": 5.208333333333334e-07,
"logits/chosen": 0.1497882902622223,
"logits/rejected": 0.240590900182724,
"logps/chosen": -311.1229553222656,
"logps/rejected": -286.51702880859375,
"loss": 0.1814,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.005703258328139782,
"rewards/margins": 0.022644545882940292,
"rewards/rejected": -0.02834780514240265,
"step": 50
},
{
"epoch": 0.13,
"learning_rate": 6.249999999999999e-07,
"logits/chosen": 0.13869214057922363,
"logits/rejected": 0.28307411074638367,
"logps/chosen": -295.9754638671875,
"logps/rejected": -281.43798828125,
"loss": 0.1766,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.03096725046634674,
"rewards/margins": 0.028959080576896667,
"rewards/rejected": -0.059926338493824005,
"step": 60
},
{
"epoch": 0.15,
"learning_rate": 7.291666666666666e-07,
"logits/chosen": 0.18460798263549805,
"logits/rejected": 0.2718513607978821,
"logps/chosen": -335.46148681640625,
"logps/rejected": -330.33404541015625,
"loss": 0.174,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.057377688586711884,
"rewards/margins": 0.05648452043533325,
"rewards/rejected": -0.11386220157146454,
"step": 70
},
{
"epoch": 0.17,
"learning_rate": 8.333333333333333e-07,
"logits/chosen": 0.29816848039627075,
"logits/rejected": 0.4011983871459961,
"logps/chosen": -330.4580383300781,
"logps/rejected": -311.96490478515625,
"loss": 0.159,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.11794394254684448,
"rewards/margins": 0.13102997839450836,
"rewards/rejected": -0.24897389113903046,
"step": 80
},
{
"epoch": 0.19,
"learning_rate": 9.374999999999999e-07,
"logits/chosen": 0.2283201515674591,
"logits/rejected": 0.37335914373397827,
"logps/chosen": -358.6737365722656,
"logps/rejected": -304.0804138183594,
"loss": 0.1421,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.21732211112976074,
"rewards/margins": 0.15273679792881012,
"rewards/rejected": -0.37005892395973206,
"step": 90
},
{
"epoch": 0.21,
"learning_rate": 9.999463737538052e-07,
"logits/chosen": 0.2938156723976135,
"logits/rejected": 0.46553492546081543,
"logps/chosen": -361.78338623046875,
"logps/rejected": -343.25750732421875,
"loss": 0.1217,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.27221935987472534,
"rewards/margins": 0.23653486371040344,
"rewards/rejected": -0.5087541937828064,
"step": 100
},
{
"epoch": 0.23,
"learning_rate": 9.993432105822034e-07,
"logits/chosen": 0.31155580282211304,
"logits/rejected": 0.3508353531360626,
"logps/chosen": -353.184814453125,
"logps/rejected": -366.32720947265625,
"loss": 0.106,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.40565404295921326,
"rewards/margins": 0.2631165683269501,
"rewards/rejected": -0.6687706708908081,
"step": 110
},
{
"epoch": 0.25,
"learning_rate": 9.980706626858607e-07,
"logits/chosen": 0.26659709215164185,
"logits/rejected": 0.3288796842098236,
"logps/chosen": -374.50274658203125,
"logps/rejected": -403.8424377441406,
"loss": 0.0951,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5394914150238037,
"rewards/margins": 0.28696924448013306,
"rewards/rejected": -0.8264607191085815,
"step": 120
},
{
"epoch": 0.27,
"learning_rate": 9.961304359538434e-07,
"logits/chosen": 0.1616436094045639,
"logits/rejected": 0.2970871031284332,
"logps/chosen": -396.555419921875,
"logps/rejected": -362.3848876953125,
"loss": 0.0934,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5805934071540833,
"rewards/margins": 0.19475166499614716,
"rewards/rejected": -0.775344967842102,
"step": 130
},
{
"epoch": 0.29,
"learning_rate": 9.935251313189563e-07,
"logits/chosen": 0.1485656201839447,
"logits/rejected": 0.2714545428752899,
"logps/chosen": -384.0659484863281,
"logps/rejected": -346.6048278808594,
"loss": 0.0933,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.5202253460884094,
"rewards/margins": 0.24675369262695312,
"rewards/rejected": -0.766978919506073,
"step": 140
},
{
"epoch": 0.31,
"learning_rate": 9.902582412711118e-07,
"logits/chosen": 0.12988325953483582,
"logits/rejected": 0.1523539423942566,
"logps/chosen": -379.16839599609375,
"logps/rejected": -395.9466552734375,
"loss": 0.1019,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4386775493621826,
"rewards/margins": 0.37129276990890503,
"rewards/rejected": -0.8099702596664429,
"step": 150
},
{
"epoch": 0.33,
"learning_rate": 9.86334145175542e-07,
"logits/chosen": 0.06655962765216827,
"logits/rejected": 0.09024105966091156,
"logps/chosen": -341.7105407714844,
"logps/rejected": -360.19805908203125,
"loss": 0.0937,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3964901566505432,
"rewards/margins": 0.3985019028186798,
"rewards/rejected": -0.7949920892715454,
"step": 160
},
{
"epoch": 0.36,
"learning_rate": 9.817581034021272e-07,
"logits/chosen": 0.16973164677619934,
"logits/rejected": 0.21836213767528534,
"logps/chosen": -398.22369384765625,
"logps/rejected": -417.8206481933594,
"loss": 0.081,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6185532808303833,
"rewards/margins": 0.4811604917049408,
"rewards/rejected": -1.0997138023376465,
"step": 170
},
{
"epoch": 0.38,
"learning_rate": 9.765362502737097e-07,
"logits/chosen": 0.09212584793567657,
"logits/rejected": 0.23974208533763885,
"logps/chosen": -388.64910888671875,
"logps/rejected": -411.5782775878906,
"loss": 0.0713,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.6261709928512573,
"rewards/margins": 0.4908596873283386,
"rewards/rejected": -1.1170307397842407,
"step": 180
},
{
"epoch": 0.4,
"learning_rate": 9.706755858428485e-07,
"logits/chosen": 0.1811675727367401,
"logits/rejected": 0.27236208319664,
"logps/chosen": -419.11376953125,
"logps/rejected": -437.33843994140625,
"loss": 0.0681,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8445426225662231,
"rewards/margins": 0.4015916883945465,
"rewards/rejected": -1.2461342811584473,
"step": 190
},
{
"epoch": 0.42,
"learning_rate": 9.641839665080363e-07,
"logits/chosen": 0.14256766438484192,
"logits/rejected": 0.2711044251918793,
"logps/chosen": -414.55975341796875,
"logps/rejected": -416.9037170410156,
"loss": 0.0675,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7406997680664062,
"rewards/margins": 0.48706990480422974,
"rewards/rejected": -1.2277696132659912,
"step": 200
},
{
"epoch": 0.44,
"learning_rate": 9.570700944819582e-07,
"logits/chosen": 0.23208096623420715,
"logits/rejected": 0.35697174072265625,
"logps/chosen": -382.19970703125,
"logps/rejected": -386.50701904296875,
"loss": 0.0708,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.6804240942001343,
"rewards/margins": 0.48590850830078125,
"rewards/rejected": -1.166332721710205,
"step": 210
},
{
"epoch": 0.46,
"learning_rate": 9.493435061259129e-07,
"logits/chosen": 0.13639363646507263,
"logits/rejected": 0.23731064796447754,
"logps/chosen": -382.42022705078125,
"logps/rejected": -369.6554870605469,
"loss": 0.0763,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.6574115753173828,
"rewards/margins": 0.40243881940841675,
"rewards/rejected": -1.0598504543304443,
"step": 220
},
{
"epoch": 0.48,
"learning_rate": 9.4101455916603e-07,
"logits/chosen": 0.1799091249704361,
"logits/rejected": 0.2304597645998001,
"logps/chosen": -416.672607421875,
"logps/rejected": -420.39862060546875,
"loss": 0.0668,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.9061130285263062,
"rewards/margins": 0.46666598320007324,
"rewards/rejected": -1.3727790117263794,
"step": 230
},
{
"epoch": 0.5,
"learning_rate": 9.320944188084241e-07,
"logits/chosen": 0.08318189531564713,
"logits/rejected": 0.13486048579216003,
"logps/chosen": -408.77545166015625,
"logps/rejected": -427.9566345214844,
"loss": 0.0639,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.968237042427063,
"rewards/margins": 0.2922549843788147,
"rewards/rejected": -1.260491967201233,
"step": 240
},
{
"epoch": 0.52,
"learning_rate": 9.225950427718974e-07,
"logits/chosen": 0.051157813519239426,
"logits/rejected": 0.1319509893655777,
"logps/chosen": -385.2474670410156,
"logps/rejected": -402.11126708984375,
"loss": 0.0631,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.7319932579994202,
"rewards/margins": 0.468679815530777,
"rewards/rejected": -1.2006731033325195,
"step": 250
},
{
"epoch": 0.54,
"learning_rate": 9.125291652582547e-07,
"logits/chosen": 0.013853952288627625,
"logits/rejected": 0.10071275383234024,
"logps/chosen": -445.53607177734375,
"logps/rejected": -434.2711486816406,
"loss": 0.0641,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.9089228510856628,
"rewards/margins": 0.4331666827201843,
"rewards/rejected": -1.3420894145965576,
"step": 260
},
{
"epoch": 0.57,
"learning_rate": 9.019102798817195e-07,
"logits/chosen": 0.1297096163034439,
"logits/rejected": 0.1613592505455017,
"logps/chosen": -403.47393798828125,
"logps/rejected": -446.1951599121094,
"loss": 0.0685,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.7434005737304688,
"rewards/margins": 0.6140644550323486,
"rewards/rejected": -1.357465147972107,
"step": 270
},
{
"epoch": 0.59,
"learning_rate": 8.90752621580335e-07,
"logits/chosen": 0.16231071949005127,
"logits/rejected": 0.1873283088207245,
"logps/chosen": -362.4006652832031,
"logps/rejected": -398.279296875,
"loss": 0.0751,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6591774225234985,
"rewards/margins": 0.41294485330581665,
"rewards/rejected": -1.07212233543396,
"step": 280
},
{
"epoch": 0.61,
"learning_rate": 8.79071147533597e-07,
"logits/chosen": 0.14204099774360657,
"logits/rejected": 0.20997166633605957,
"logps/chosen": -424.5856018066406,
"logps/rejected": -456.9698181152344,
"loss": 0.0642,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7815448045730591,
"rewards/margins": 0.5602203011512756,
"rewards/rejected": -1.34176504611969,
"step": 290
},
{
"epoch": 0.63,
"learning_rate": 8.668815171119019e-07,
"logits/chosen": 0.2026984989643097,
"logits/rejected": 0.23374077677726746,
"logps/chosen": -380.8060607910156,
"logps/rejected": -468.7802734375,
"loss": 0.0554,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8905105590820312,
"rewards/margins": 0.5638677477836609,
"rewards/rejected": -1.454378366470337,
"step": 300
},
{
"epoch": 0.65,
"learning_rate": 8.54200070884685e-07,
"logits/chosen": 0.23336808383464813,
"logits/rejected": 0.25176650285720825,
"logps/chosen": -385.24676513671875,
"logps/rejected": -462.87322998046875,
"loss": 0.0565,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.8951492309570312,
"rewards/margins": 0.6165014505386353,
"rewards/rejected": -1.5116506814956665,
"step": 310
},
{
"epoch": 0.67,
"learning_rate": 8.410438087153911e-07,
"logits/chosen": 0.22913236916065216,
"logits/rejected": 0.3360585570335388,
"logps/chosen": -383.767578125,
"logps/rejected": -424.25067138671875,
"loss": 0.0641,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6813658475875854,
"rewards/margins": 0.6591276526451111,
"rewards/rejected": -1.3404934406280518,
"step": 320
},
{
"epoch": 0.69,
"learning_rate": 8.274303669726426e-07,
"logits/chosen": 0.22990348935127258,
"logits/rejected": 0.3006184697151184,
"logps/chosen": -366.43499755859375,
"logps/rejected": -444.06536865234375,
"loss": 0.0636,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6766657829284668,
"rewards/margins": 0.6564770936965942,
"rewards/rejected": -1.333142876625061,
"step": 330
},
{
"epoch": 0.71,
"learning_rate": 8.133779948881513e-07,
"logits/chosen": 0.22257550060749054,
"logits/rejected": 0.3241097033023834,
"logps/chosen": -360.141845703125,
"logps/rejected": -405.85711669921875,
"loss": 0.0662,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7344536781311035,
"rewards/margins": 0.7157880067825317,
"rewards/rejected": -1.4502416849136353,
"step": 340
},
{
"epoch": 0.73,
"learning_rate": 7.989055300930704e-07,
"logits/chosen": 0.1499968320131302,
"logits/rejected": 0.15372925996780396,
"logps/chosen": -388.67559814453125,
"logps/rejected": -462.0445251464844,
"loss": 0.0644,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8717344403266907,
"rewards/margins": 0.6429644227027893,
"rewards/rejected": -1.51469886302948,
"step": 350
},
{
"epoch": 0.75,
"learning_rate": 7.840323733655778e-07,
"logits/chosen": 0.08885981142520905,
"logits/rejected": 0.19541098177433014,
"logps/chosen": -407.87286376953125,
"logps/rejected": -420.4515686035156,
"loss": 0.0583,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.797155499458313,
"rewards/margins": 0.5855330228805542,
"rewards/rejected": -1.3826884031295776,
"step": 360
},
{
"epoch": 0.77,
"learning_rate": 7.687784626235447e-07,
"logits/chosen": 0.05912008136510849,
"logits/rejected": 0.17702099680900574,
"logps/chosen": -428.82354736328125,
"logps/rejected": -466.0895080566406,
"loss": 0.0599,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.803920567035675,
"rewards/margins": 0.7507921457290649,
"rewards/rejected": -1.5547125339508057,
"step": 370
},
{
"epoch": 0.8,
"learning_rate": 7.531642461971514e-07,
"logits/chosen": 0.11388075351715088,
"logits/rejected": 0.1931450068950653,
"logps/chosen": -388.9282531738281,
"logps/rejected": -427.1614685058594,
"loss": 0.0578,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.9585503339767456,
"rewards/margins": 0.5912213325500488,
"rewards/rejected": -1.5497716665267944,
"step": 380
},
{
"epoch": 0.82,
"learning_rate": 7.372106554172801e-07,
"logits/chosen": -0.049389470368623734,
"logits/rejected": 0.10218650102615356,
"logps/chosen": -443.7737731933594,
"logps/rejected": -484.5735778808594,
"loss": 0.0446,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.0208237171173096,
"rewards/margins": 0.8150562047958374,
"rewards/rejected": -1.835879921913147,
"step": 390
},
{
"epoch": 0.84,
"learning_rate": 7.209390765564318e-07,
"logits/chosen": 0.07526848465204239,
"logits/rejected": 0.1457681804895401,
"logps/chosen": -430.77130126953125,
"logps/rejected": -478.53118896484375,
"loss": 0.0488,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.137662410736084,
"rewards/margins": 0.6997725963592529,
"rewards/rejected": -1.837435007095337,
"step": 400
},
{
"epoch": 0.86,
"learning_rate": 7.043713221597773e-07,
"logits/chosen": -0.014962440356612206,
"logits/rejected": 0.049673158675432205,
"logps/chosen": -394.35980224609375,
"logps/rejected": -455.79168701171875,
"loss": 0.0469,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.0516221523284912,
"rewards/margins": 0.6002627015113831,
"rewards/rejected": -1.65188467502594,
"step": 410
},
{
"epoch": 0.88,
"learning_rate": 6.875296018047809e-07,
"logits/chosen": 0.1113734096288681,
"logits/rejected": 0.17297616600990295,
"logps/chosen": -371.1769104003906,
"logps/rejected": -433.82763671875,
"loss": 0.057,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7784308791160583,
"rewards/margins": 0.7032991647720337,
"rewards/rejected": -1.4817302227020264,
"step": 420
},
{
"epoch": 0.9,
"learning_rate": 6.704364923285857e-07,
"logits/chosen": 0.08021976053714752,
"logits/rejected": 0.09611347317695618,
"logps/chosen": -433.26898193359375,
"logps/rejected": -482.2544860839844,
"loss": 0.0623,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9919212460517883,
"rewards/margins": 0.5928072333335876,
"rewards/rejected": -1.584728479385376,
"step": 430
},
{
"epoch": 0.92,
"learning_rate": 6.531149075630796e-07,
"logits/chosen": 0.06492827087640762,
"logits/rejected": 0.09372309595346451,
"logps/chosen": -369.0657958984375,
"logps/rejected": -427.1637268066406,
"loss": 0.0602,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.8450859785079956,
"rewards/margins": 0.6487796902656555,
"rewards/rejected": -1.4938656091690063,
"step": 440
},
{
"epoch": 0.94,
"learning_rate": 6.355880676182085e-07,
"logits/chosen": 0.015085640363395214,
"logits/rejected": 0.1697283238172531,
"logps/chosen": -454.42071533203125,
"logps/rejected": -461.6656799316406,
"loss": 0.0537,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0446925163269043,
"rewards/margins": 0.7324589490890503,
"rewards/rejected": -1.7771514654159546,
"step": 450
},
{
"epoch": 0.96,
"learning_rate": 6.178794677547137e-07,
"logits/chosen": 0.052903078496456146,
"logits/rejected": 0.21909013390541077,
"logps/chosen": -389.771728515625,
"logps/rejected": -432.63311767578125,
"loss": 0.0475,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.918341338634491,
"rewards/margins": 0.7504295706748962,
"rewards/rejected": -1.6687707901000977,
"step": 460
},
{
"epoch": 0.98,
"learning_rate": 6.000128468880222e-07,
"logits/chosen": 0.0020152360666543245,
"logits/rejected": 0.10528425872325897,
"logps/chosen": -439.73016357421875,
"logps/rejected": -486.3055114746094,
"loss": 0.0531,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0058103799819946,
"rewards/margins": 0.8824182748794556,
"rewards/rejected": -1.8882286548614502,
"step": 470
},
{
"epoch": 1.0,
"learning_rate": 5.820121557655108e-07,
"logits/chosen": 0.03267590329051018,
"logits/rejected": 0.10403893887996674,
"logps/chosen": -426.3312072753906,
"logps/rejected": -521.575439453125,
"loss": 0.0497,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.897496223449707,
"rewards/margins": 1.0473217964172363,
"rewards/rejected": -1.9448179006576538,
"step": 480
},
{
"epoch": 1.03,
"learning_rate": 5.639015248598023e-07,
"logits/chosen": -0.05066138505935669,
"logits/rejected": 0.0016520231729373336,
"logps/chosen": -459.2066955566406,
"logps/rejected": -572.3805541992188,
"loss": 0.0254,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.404326319694519,
"rewards/margins": 1.2682745456695557,
"rewards/rejected": -2.6726012229919434,
"step": 490
},
{
"epoch": 1.05,
"learning_rate": 5.457052320211339e-07,
"logits/chosen": 0.10663177818059921,
"logits/rejected": 0.143524631857872,
"logps/chosen": -454.5547790527344,
"logps/rejected": -574.3235473632812,
"loss": 0.0198,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.592284083366394,
"rewards/margins": 1.2184875011444092,
"rewards/rejected": -2.8107717037200928,
"step": 500
},
{
"epoch": 1.07,
"learning_rate": 5.274476699321637e-07,
"logits/chosen": -0.019788045436143875,
"logits/rejected": 0.12656378746032715,
"logps/chosen": -488.24627685546875,
"logps/rejected": -596.00537109375,
"loss": 0.015,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8213142156600952,
"rewards/margins": 1.3653538227081299,
"rewards/rejected": -3.1866683959960938,
"step": 510
},
{
"epoch": 1.09,
"learning_rate": 5.091533134088387e-07,
"logits/chosen": -0.0814504474401474,
"logits/rejected": 0.05524957925081253,
"logps/chosen": -552.7730712890625,
"logps/rejected": -634.5548095703125,
"loss": 0.0147,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.0995850563049316,
"rewards/margins": 1.1655638217926025,
"rewards/rejected": -3.2651493549346924,
"step": 520
},
{
"epoch": 1.11,
"learning_rate": 4.908466865911614e-07,
"logits/chosen": 0.03363295644521713,
"logits/rejected": 0.043015364557504654,
"logps/chosen": -468.89593505859375,
"logps/rejected": -560.2864990234375,
"loss": 0.0174,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.5512639284133911,
"rewards/margins": 1.2513355016708374,
"rewards/rejected": -2.8025994300842285,
"step": 530
},
{
"epoch": 1.13,
"learning_rate": 4.7255233006783624e-07,
"logits/chosen": -0.03754299506545067,
"logits/rejected": 0.08725563436746597,
"logps/chosen": -456.68243408203125,
"logps/rejected": -549.9105224609375,
"loss": 0.0178,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.657478928565979,
"rewards/margins": 1.0530353784561157,
"rewards/rejected": -2.7105140686035156,
"step": 540
},
{
"epoch": 1.15,
"learning_rate": 4.5429476797886617e-07,
"logits/chosen": 0.0340617299079895,
"logits/rejected": 0.1264275759458542,
"logps/chosen": -469.5687561035156,
"logps/rejected": -592.4705810546875,
"loss": 0.0185,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.408406138420105,
"rewards/margins": 1.4667712450027466,
"rewards/rejected": -2.8751769065856934,
"step": 550
},
{
"epoch": 1.17,
"learning_rate": 4.3609847514019763e-07,
"logits/chosen": 0.0167356226593256,
"logits/rejected": 0.032135289162397385,
"logps/chosen": -480.41278076171875,
"logps/rejected": -577.2174072265625,
"loss": 0.0165,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.5578255653381348,
"rewards/margins": 1.0947318077087402,
"rewards/rejected": -2.652557611465454,
"step": 560
},
{
"epoch": 1.19,
"learning_rate": 4.179878442344892e-07,
"logits/chosen": 0.10041844844818115,
"logits/rejected": 0.16732005774974823,
"logps/chosen": -453.9161071777344,
"logps/rejected": -615.6796875,
"loss": 0.0153,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7070415019989014,
"rewards/margins": 1.4755295515060425,
"rewards/rejected": -3.1825711727142334,
"step": 570
},
{
"epoch": 1.21,
"learning_rate": 3.9998715311197783e-07,
"logits/chosen": 0.1310591995716095,
"logits/rejected": 0.20585906505584717,
"logps/chosen": -493.8118591308594,
"logps/rejected": -631.4963989257812,
"loss": 0.015,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.7850983142852783,
"rewards/margins": 1.443263292312622,
"rewards/rejected": -3.228361129760742,
"step": 580
},
{
"epoch": 1.24,
"learning_rate": 3.821205322452863e-07,
"logits/chosen": 0.22954685986042023,
"logits/rejected": 0.2483092099428177,
"logps/chosen": -473.4378967285156,
"logps/rejected": -605.134033203125,
"loss": 0.0149,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.700280785560608,
"rewards/margins": 1.460669755935669,
"rewards/rejected": -3.1609506607055664,
"step": 590
},
{
"epoch": 1.26,
"learning_rate": 3.6441193238179146e-07,
"logits/chosen": 0.13607949018478394,
"logits/rejected": 0.1680508852005005,
"logps/chosen": -451.55340576171875,
"logps/rejected": -627.7686157226562,
"loss": 0.0147,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6148862838745117,
"rewards/margins": 1.678989052772522,
"rewards/rejected": -3.2938759326934814,
"step": 600
},
{
"epoch": 1.28,
"learning_rate": 3.4688509243692034e-07,
"logits/chosen": 0.04345204681158066,
"logits/rejected": 0.13040025532245636,
"logps/chosen": -461.54095458984375,
"logps/rejected": -684.9581909179688,
"loss": 0.0153,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.6480602025985718,
"rewards/margins": 1.6946277618408203,
"rewards/rejected": -3.3426880836486816,
"step": 610
},
{
"epoch": 1.3,
"learning_rate": 3.295635076714144e-07,
"logits/chosen": 0.18233785033226013,
"logits/rejected": 0.19972297549247742,
"logps/chosen": -408.9209899902344,
"logps/rejected": -547.9658813476562,
"loss": 0.0143,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6356074810028076,
"rewards/margins": 1.3703811168670654,
"rewards/rejected": -3.005988597869873,
"step": 620
},
{
"epoch": 1.32,
"learning_rate": 3.12470398195219e-07,
"logits/chosen": 0.15017299354076385,
"logits/rejected": 0.07167269289493561,
"logps/chosen": -474.58172607421875,
"logps/rejected": -649.4796142578125,
"loss": 0.0129,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6831333637237549,
"rewards/margins": 1.4837870597839355,
"rewards/rejected": -3.1669201850891113,
"step": 630
},
{
"epoch": 1.34,
"learning_rate": 2.956286778402226e-07,
"logits/chosen": 0.03866753727197647,
"logits/rejected": 0.20129835605621338,
"logps/chosen": -546.3468017578125,
"logps/rejected": -608.462646484375,
"loss": 0.0126,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7091865539550781,
"rewards/margins": 1.3178246021270752,
"rewards/rejected": -3.0270111560821533,
"step": 640
},
{
"epoch": 1.36,
"learning_rate": 2.7906092344356826e-07,
"logits/chosen": 0.2127591073513031,
"logits/rejected": 0.24179625511169434,
"logps/chosen": -462.47412109375,
"logps/rejected": -581.084228515625,
"loss": 0.014,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.751960039138794,
"rewards/margins": 1.4448457956314087,
"rewards/rejected": -3.196805953979492,
"step": 650
},
{
"epoch": 1.38,
"learning_rate": 2.6278934458271996e-07,
"logits/chosen": 0.09269841015338898,
"logits/rejected": 0.2964209318161011,
"logps/chosen": -479.434326171875,
"logps/rejected": -605.9524536132812,
"loss": 0.0123,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.8022867441177368,
"rewards/margins": 1.3753817081451416,
"rewards/rejected": -3.177668333053589,
"step": 660
},
{
"epoch": 1.4,
"learning_rate": 2.468357538028487e-07,
"logits/chosen": 0.16141146421432495,
"logits/rejected": 0.18542757630348206,
"logps/chosen": -487.90277099609375,
"logps/rejected": -652.5034790039062,
"loss": 0.0107,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9332258701324463,
"rewards/margins": 1.736053705215454,
"rewards/rejected": -3.6692795753479004,
"step": 670
},
{
"epoch": 1.42,
"learning_rate": 2.312215373764551e-07,
"logits/chosen": 0.07799498736858368,
"logits/rejected": 0.17718131840229034,
"logps/chosen": -603.2567138671875,
"logps/rejected": -699.2156372070312,
"loss": 0.0101,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.1482930183410645,
"rewards/margins": 1.3787685632705688,
"rewards/rejected": -3.5270614624023438,
"step": 680
},
{
"epoch": 1.44,
"learning_rate": 2.1596762663442213e-07,
"logits/chosen": 0.2014874666929245,
"logits/rejected": 0.3246391713619232,
"logps/chosen": -489.08349609375,
"logps/rejected": -607.5847778320312,
"loss": 0.0096,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.083740711212158,
"rewards/margins": 1.446257472038269,
"rewards/rejected": -3.5299980640411377,
"step": 690
},
{
"epoch": 1.47,
"learning_rate": 2.0109446990692963e-07,
"logits/chosen": 0.09734896570444107,
"logits/rejected": 0.16283641755580902,
"logps/chosen": -540.1688232421875,
"logps/rejected": -701.462890625,
"loss": 0.0094,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.07643985748291,
"rewards/margins": 1.7090556621551514,
"rewards/rejected": -3.7854957580566406,
"step": 700
},
{
"epoch": 1.49,
"learning_rate": 1.8662200511184872e-07,
"logits/chosen": 0.07912759482860565,
"logits/rejected": 0.19963078200817108,
"logps/chosen": -491.30426025390625,
"logps/rejected": -630.0563354492188,
"loss": 0.0099,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9977525472640991,
"rewards/margins": 1.5802443027496338,
"rewards/rejected": -3.5779967308044434,
"step": 710
},
{
"epoch": 1.51,
"learning_rate": 1.725696330273575e-07,
"logits/chosen": 0.14783975481987,
"logits/rejected": 0.27563345432281494,
"logps/chosen": -530.8796997070312,
"logps/rejected": -640.3440551757812,
"loss": 0.0107,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.036653518676758,
"rewards/margins": 1.323557734489441,
"rewards/rejected": -3.3602116107940674,
"step": 720
},
{
"epoch": 1.53,
"learning_rate": 1.589561912846089e-07,
"logits/chosen": 0.16717246174812317,
"logits/rejected": 0.2920343279838562,
"logps/chosen": -499.3802795410156,
"logps/rejected": -612.64892578125,
"loss": 0.012,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.0618550777435303,
"rewards/margins": 1.435462236404419,
"rewards/rejected": -3.4973175525665283,
"step": 730
},
{
"epoch": 1.55,
"learning_rate": 1.4579992911531496e-07,
"logits/chosen": 0.1249130517244339,
"logits/rejected": 0.23616066575050354,
"logps/chosen": -575.0750732421875,
"logps/rejected": -649.9669189453125,
"loss": 0.0106,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.2815146446228027,
"rewards/margins": 1.226216197013855,
"rewards/rejected": -3.5077309608459473,
"step": 740
},
{
"epoch": 1.57,
"learning_rate": 1.3311848288809813e-07,
"logits/chosen": 0.21837782859802246,
"logits/rejected": 0.31546956300735474,
"logps/chosen": -510.7059020996094,
"logps/rejected": -609.2933959960938,
"loss": 0.0119,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.790372610092163,
"rewards/margins": 1.2426694631576538,
"rewards/rejected": -3.0330421924591064,
"step": 750
},
{
"epoch": 1.59,
"learning_rate": 1.209288524664029e-07,
"logits/chosen": 0.14562873542308807,
"logits/rejected": 0.3084864318370819,
"logps/chosen": -622.6912841796875,
"logps/rejected": -749.8731689453125,
"loss": 0.0131,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.2252538204193115,
"rewards/margins": 1.5818650722503662,
"rewards/rejected": -3.8071188926696777,
"step": 760
},
{
"epoch": 1.61,
"learning_rate": 1.0924737841966497e-07,
"logits/chosen": 0.1799144446849823,
"logits/rejected": 0.354133278131485,
"logps/chosen": -585.0472412109375,
"logps/rejected": -712.3133544921875,
"loss": 0.0107,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.1570990085601807,
"rewards/margins": 1.6586040258407593,
"rewards/rejected": -3.8157036304473877,
"step": 770
},
{
"epoch": 1.63,
"learning_rate": 9.808972011828054e-08,
"logits/chosen": 0.20896565914154053,
"logits/rejected": 0.1832619458436966,
"logps/chosen": -474.9366149902344,
"logps/rejected": -665.3892822265625,
"loss": 0.0099,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.9308887720108032,
"rewards/margins": 1.5281493663787842,
"rewards/rejected": -3.459038257598877,
"step": 780
},
{
"epoch": 1.65,
"learning_rate": 8.747083474174527e-08,
"logits/chosen": 0.25221484899520874,
"logits/rejected": 0.3025228679180145,
"logps/chosen": -486.76678466796875,
"logps/rejected": -610.9810791015625,
"loss": 0.01,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9139289855957031,
"rewards/margins": 1.4173685312271118,
"rewards/rejected": -3.3312973976135254,
"step": 790
},
{
"epoch": 1.67,
"learning_rate": 7.740495722810269e-08,
"logits/chosen": 0.12703558802604675,
"logits/rejected": 0.25433093309402466,
"logps/chosen": -528.8013916015625,
"logps/rejected": -645.4374389648438,
"loss": 0.01,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.129984140396118,
"rewards/margins": 1.322923183441162,
"rewards/rejected": -3.452907085418701,
"step": 800
},
{
"epoch": 1.7,
"learning_rate": 6.790558119157597e-08,
"logits/chosen": 0.1941952407360077,
"logits/rejected": 0.36538344621658325,
"logps/chosen": -536.0458374023438,
"logps/rejected": -630.6697387695312,
"loss": 0.0111,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.9618316888809204,
"rewards/margins": 1.3840124607086182,
"rewards/rejected": -3.34584379196167,
"step": 810
},
{
"epoch": 1.72,
"learning_rate": 5.898544083397e-08,
"logits/chosen": 0.1936766654253006,
"logits/rejected": 0.22626741230487823,
"logps/chosen": -482.18902587890625,
"logps/rejected": -640.9258422851562,
"loss": 0.0113,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.822951078414917,
"rewards/margins": 1.679091215133667,
"rewards/rejected": -3.502042055130005,
"step": 820
},
{
"epoch": 1.74,
"learning_rate": 5.065649387408705e-08,
"logits/chosen": 0.16037659347057343,
"logits/rejected": 0.23867423832416534,
"logps/chosen": -536.796630859375,
"logps/rejected": -645.6795654296875,
"loss": 0.0119,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.09273624420166,
"rewards/margins": 1.3475998640060425,
"rewards/rejected": -3.440336227416992,
"step": 830
},
{
"epoch": 1.76,
"learning_rate": 4.292990551804171e-08,
"logits/chosen": 0.11955185234546661,
"logits/rejected": 0.2987907826900482,
"logps/chosen": -521.8675537109375,
"logps/rejected": -622.3560791015625,
"loss": 0.0115,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9727070331573486,
"rewards/margins": 1.207002878189087,
"rewards/rejected": -3.1797099113464355,
"step": 840
},
{
"epoch": 1.78,
"learning_rate": 3.581603349196371e-08,
"logits/chosen": 0.12183141708374023,
"logits/rejected": 0.24950018525123596,
"logps/chosen": -529.2427978515625,
"logps/rejected": -662.9299926757812,
"loss": 0.0112,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.024509906768799,
"rewards/margins": 1.5907318592071533,
"rewards/rejected": -3.615241289138794,
"step": 850
},
{
"epoch": 1.8,
"learning_rate": 2.9324414157151367e-08,
"logits/chosen": 0.11247365176677704,
"logits/rejected": 0.28803473711013794,
"logps/chosen": -538.6015625,
"logps/rejected": -616.6097412109375,
"loss": 0.0105,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.005286693572998,
"rewards/margins": 1.320533037185669,
"rewards/rejected": -3.325819492340088,
"step": 860
},
{
"epoch": 1.82,
"learning_rate": 2.3463749726290284e-08,
"logits/chosen": 0.09726160764694214,
"logits/rejected": 0.3085189759731293,
"logps/chosen": -527.7420043945312,
"logps/rejected": -666.7064208984375,
"loss": 0.0114,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.963595986366272,
"rewards/margins": 1.6061077117919922,
"rewards/rejected": -3.5697035789489746,
"step": 870
},
{
"epoch": 1.84,
"learning_rate": 1.824189659787284e-08,
"logits/chosen": 0.19652321934700012,
"logits/rejected": 0.2885872423648834,
"logps/chosen": -515.560546875,
"logps/rejected": -641.10791015625,
"loss": 0.0111,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.9605176448822021,
"rewards/margins": 1.3721264600753784,
"rewards/rejected": -3.33264422416687,
"step": 880
},
{
"epoch": 1.86,
"learning_rate": 1.3665854824458035e-08,
"logits/chosen": 0.16733339428901672,
"logits/rejected": 0.3634529113769531,
"logps/chosen": -542.18505859375,
"logps/rejected": -629.7310791015625,
"loss": 0.0115,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.0391831398010254,
"rewards/margins": 1.1835925579071045,
"rewards/rejected": -3.2227752208709717,
"step": 890
},
{
"epoch": 1.88,
"learning_rate": 9.741758728888217e-09,
"logits/chosen": 0.08950433880090714,
"logits/rejected": 0.2665843069553375,
"logps/chosen": -533.1641845703125,
"logps/rejected": -621.0523681640625,
"loss": 0.0113,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.9605424404144287,
"rewards/margins": 1.1125773191452026,
"rewards/rejected": -3.073119640350342,
"step": 900
},
{
"epoch": 1.91,
"learning_rate": 6.474868681043577e-09,
"logits/chosen": 0.13345034420490265,
"logits/rejected": 0.2458508014678955,
"logps/chosen": -523.0572509765625,
"logps/rejected": -666.5548706054688,
"loss": 0.0107,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.094968557357788,
"rewards/margins": 1.4136923551559448,
"rewards/rejected": -3.5086607933044434,
"step": 910
},
{
"epoch": 1.93,
"learning_rate": 3.869564046156459e-09,
"logits/chosen": 0.17636564373970032,
"logits/rejected": 0.24904970824718475,
"logps/chosen": -521.7586669921875,
"logps/rejected": -661.547119140625,
"loss": 0.0115,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.0953400135040283,
"rewards/margins": 1.3953152894973755,
"rewards/rejected": -3.4906551837921143,
"step": 920
},
{
"epoch": 1.95,
"learning_rate": 1.929337314139412e-09,
"logits/chosen": 0.1708141714334488,
"logits/rejected": 0.2874212861061096,
"logps/chosen": -481.3929138183594,
"logps/rejected": -591.492431640625,
"loss": 0.0107,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8482071161270142,
"rewards/margins": 1.3176212310791016,
"rewards/rejected": -3.165828227996826,
"step": 930
},
{
"epoch": 1.97,
"learning_rate": 6.567894177967325e-10,
"logits/chosen": 0.1810809224843979,
"logits/rejected": 0.3499010503292084,
"logps/chosen": -509.21966552734375,
"logps/rejected": -619.0591430664062,
"loss": 0.0119,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7878868579864502,
"rewards/margins": 1.3797376155853271,
"rewards/rejected": -3.1676242351531982,
"step": 940
},
{
"epoch": 1.99,
"learning_rate": 5.3626246194704575e-11,
"logits/chosen": 0.12432925403118134,
"logits/rejected": 0.1847553700208664,
"logps/chosen": -471.4737854003906,
"logps/rejected": -620.7115478515625,
"loss": 0.0121,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8229620456695557,
"rewards/margins": 1.5415856838226318,
"rewards/rejected": -3.3645477294921875,
"step": 950
},
{
"epoch": 2.0,
"step": 954,
"total_flos": 0.0,
"train_loss": 0.050850671487596796,
"train_runtime": 12712.7589,
"train_samples_per_second": 9.618,
"train_steps_per_second": 0.075
}
],
"logging_steps": 10,
"max_steps": 954,
"num_train_epochs": 2,
"save_steps": 10000,
"total_flos": 0.0,
"trial_name": null,
"trial_params": null
}