zephyr-7b-dpo-full / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9987908101571947,
"eval_steps": 10000000,
"global_step": 413,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 6401.270603874373,
"learning_rate": 9.523809523809522e-09,
"logits/chosen": -2.7005977630615234,
"logits/rejected": -2.6288318634033203,
"logps/chosen": -1.1158788204193115,
"logps/rejected": -1.1333446502685547,
"loss": 0.7544,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 9369.590990783972,
"learning_rate": 9.523809523809523e-08,
"logits/chosen": -2.76228666305542,
"logits/rejected": -2.6970374584198,
"logps/chosen": -0.837486743927002,
"logps/rejected": -0.8182350993156433,
"loss": 0.9695,
"rewards/accuracies": 0.4305555522441864,
"rewards/chosen": 0.06597563624382019,
"rewards/margins": 0.437710702419281,
"rewards/rejected": -0.3717350959777832,
"step": 10
},
{
"epoch": 0.05,
"grad_norm": 5966.657402243146,
"learning_rate": 1.9047619047619045e-07,
"logits/chosen": -2.6901049613952637,
"logits/rejected": -2.6502909660339355,
"logps/chosen": -0.9933319091796875,
"logps/rejected": -1.0394352674484253,
"loss": 1.0318,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.794396698474884,
"rewards/margins": 0.7471516728401184,
"rewards/rejected": -1.5415483713150024,
"step": 20
},
{
"epoch": 0.07,
"grad_norm": 8820.198504372876,
"learning_rate": 2.857142857142857e-07,
"logits/chosen": -2.7333264350891113,
"logits/rejected": -2.6793360710144043,
"logps/chosen": -0.9710652232170105,
"logps/rejected": -0.9799602627754211,
"loss": 1.3198,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.275942325592041,
"rewards/margins": 0.9020620584487915,
"rewards/rejected": -3.178004264831543,
"step": 30
},
{
"epoch": 0.1,
"grad_norm": 8453.783513094899,
"learning_rate": 3.809523809523809e-07,
"logits/chosen": -2.6771621704101562,
"logits/rejected": -2.6321842670440674,
"logps/chosen": -0.989823043346405,
"logps/rejected": -0.9216930270195007,
"loss": 2.0555,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.6586966514587402,
"rewards/margins": 5.100310325622559,
"rewards/rejected": -5.759006500244141,
"step": 40
},
{
"epoch": 0.12,
"grad_norm": 4192.139232222726,
"learning_rate": 3.995412608484087e-07,
"logits/chosen": -2.743403911590576,
"logits/rejected": -2.6878693103790283,
"logps/chosen": -0.9671042561531067,
"logps/rejected": -0.917597770690918,
"loss": 2.6495,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.16885781288146973,
"rewards/margins": 5.186079978942871,
"rewards/rejected": -5.35493803024292,
"step": 50
},
{
"epoch": 0.15,
"grad_norm": 5643.860863524967,
"learning_rate": 3.976812391485896e-07,
"logits/chosen": -2.7438769340515137,
"logits/rejected": -2.676765203475952,
"logps/chosen": -0.911353588104248,
"logps/rejected": -0.9122518301010132,
"loss": 3.8047,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 2.4976494312286377,
"rewards/margins": 5.426072120666504,
"rewards/rejected": -2.928422212600708,
"step": 60
},
{
"epoch": 0.17,
"grad_norm": 4497.230754903385,
"learning_rate": 3.9440458281608213e-07,
"logits/chosen": -2.740940570831299,
"logits/rejected": -2.7162723541259766,
"logps/chosen": -0.9154363870620728,
"logps/rejected": -0.868497371673584,
"loss": 3.6432,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 5.681364059448242,
"rewards/margins": 8.000432968139648,
"rewards/rejected": -2.319068431854248,
"step": 70
},
{
"epoch": 0.19,
"grad_norm": 9843.974275847575,
"learning_rate": 3.897347732134074e-07,
"logits/chosen": -2.679215908050537,
"logits/rejected": -2.625516891479492,
"logps/chosen": -0.9146322011947632,
"logps/rejected": -1.0181081295013428,
"loss": 5.767,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -10.08639907836914,
"rewards/margins": 6.582289695739746,
"rewards/rejected": -16.668689727783203,
"step": 80
},
{
"epoch": 0.22,
"grad_norm": 4773.013380320505,
"learning_rate": 3.8370527539794614e-07,
"logits/chosen": -2.6771388053894043,
"logits/rejected": -2.6291418075561523,
"logps/chosen": -1.003847360610962,
"logps/rejected": -1.0297266244888306,
"loss": 4.6354,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 4.863407611846924,
"rewards/margins": 9.78220272064209,
"rewards/rejected": -4.918795585632324,
"step": 90
},
{
"epoch": 0.24,
"grad_norm": 3074.8663144850243,
"learning_rate": 3.763592983027255e-07,
"logits/chosen": -2.705735683441162,
"logits/rejected": -2.6605448722839355,
"logps/chosen": -0.9163268804550171,
"logps/rejected": -0.9396775960922241,
"loss": 5.8585,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -12.477940559387207,
"rewards/margins": 7.702305793762207,
"rewards/rejected": -20.180248260498047,
"step": 100
},
{
"epoch": 0.27,
"grad_norm": 4099.610429119441,
"learning_rate": 3.6774948509008527e-07,
"logits/chosen": -2.714970111846924,
"logits/rejected": -2.6705470085144043,
"logps/chosen": -0.9598251581192017,
"logps/rejected": -0.9319995641708374,
"loss": 5.1529,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 9.19798755645752,
"rewards/margins": 10.779365539550781,
"rewards/rejected": -1.5813770294189453,
"step": 110
},
{
"epoch": 0.29,
"grad_norm": 4599.711217449366,
"learning_rate": 3.579375358972288e-07,
"logits/chosen": -2.678779125213623,
"logits/rejected": -2.6315762996673584,
"logps/chosen": -0.9081487655639648,
"logps/rejected": -1.0060938596725464,
"loss": 4.0915,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -13.663342475891113,
"rewards/margins": 9.755656242370605,
"rewards/rejected": -23.418994903564453,
"step": 120
},
{
"epoch": 0.31,
"grad_norm": 4010.334966061441,
"learning_rate": 3.4699376567716156e-07,
"logits/chosen": -2.7230353355407715,
"logits/rejected": -2.684389591217041,
"logps/chosen": -0.8652521967887878,
"logps/rejected": -0.8799147605895996,
"loss": 4.4027,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 2.19469952583313,
"rewards/margins": 15.263641357421875,
"rewards/rejected": -13.068939208984375,
"step": 130
},
{
"epoch": 0.34,
"grad_norm": 5239.11146834966,
"learning_rate": 3.349966003036421e-07,
"logits/chosen": -2.689558506011963,
"logits/rejected": -2.649766445159912,
"logps/chosen": -0.9352903366088867,
"logps/rejected": -0.9416161775588989,
"loss": 4.7953,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -4.734063148498535,
"rewards/margins": 8.841203689575195,
"rewards/rejected": -13.575268745422363,
"step": 140
},
{
"epoch": 0.36,
"grad_norm": 5394.35498681908,
"learning_rate": 3.220320145511884e-07,
"logits/chosen": -2.7070841789245605,
"logits/rejected": -2.647737979888916,
"logps/chosen": -0.9441506266593933,
"logps/rejected": -0.9885166883468628,
"loss": 4.2219,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 5.9402689933776855,
"rewards/margins": 12.97706413269043,
"rewards/rejected": -7.036795139312744,
"step": 150
},
{
"epoch": 0.39,
"grad_norm": 5022.189692479379,
"learning_rate": 3.0819291597771795e-07,
"logits/chosen": -2.7051825523376465,
"logits/rejected": -2.667494297027588,
"logps/chosen": -0.911395251750946,
"logps/rejected": -0.939487099647522,
"loss": 4.7963,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -0.6114660501480103,
"rewards/margins": 9.443866729736328,
"rewards/rejected": -10.055331230163574,
"step": 160
},
{
"epoch": 0.41,
"grad_norm": 5428.944545727042,
"learning_rate": 2.9357847912507786e-07,
"logits/chosen": -2.6787288188934326,
"logits/rejected": -2.609421968460083,
"logps/chosen": -0.8976411819458008,
"logps/rejected": -0.8857674598693848,
"loss": 4.6262,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.82297945022583,
"rewards/margins": 10.640687942504883,
"rewards/rejected": -13.463666915893555,
"step": 170
},
{
"epoch": 0.44,
"grad_norm": 7317.882582449178,
"learning_rate": 2.7829343480875617e-07,
"logits/chosen": -2.6716930866241455,
"logits/rejected": -2.6018152236938477,
"logps/chosen": -0.9342878460884094,
"logps/rejected": -0.9536906480789185,
"loss": 4.5209,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 4.173262596130371,
"rewards/margins": 8.933877944946289,
"rewards/rejected": -4.760615348815918,
"step": 180
},
{
"epoch": 0.46,
"grad_norm": 5046.2946182405685,
"learning_rate": 2.624473195899052e-07,
"logits/chosen": -2.737992763519287,
"logits/rejected": -2.7089955806732178,
"logps/chosen": -0.9629039764404297,
"logps/rejected": -1.039236307144165,
"loss": 4.5521,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.3569388389587402,
"rewards/margins": 13.995905876159668,
"rewards/rejected": -17.352848052978516,
"step": 190
},
{
"epoch": 0.48,
"grad_norm": 4268.8163809344915,
"learning_rate": 2.4615369080815547e-07,
"logits/chosen": -2.6982626914978027,
"logits/rejected": -2.6629488468170166,
"logps/chosen": -0.8523995280265808,
"logps/rejected": -0.9246847033500671,
"loss": 3.8184,
"rewards/accuracies": 0.78125,
"rewards/chosen": 2.3979854583740234,
"rewards/margins": 4.996596336364746,
"rewards/rejected": -2.5986106395721436,
"step": 200
},
{
"epoch": 0.51,
"grad_norm": 4211.242306423206,
"learning_rate": 2.2952931280049625e-07,
"logits/chosen": -2.7346115112304688,
"logits/rejected": -2.6734609603881836,
"logps/chosen": -1.0063531398773193,
"logps/rejected": -0.9570119976997375,
"loss": 4.9954,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 7.86081600189209,
"rewards/margins": 13.075413703918457,
"rewards/rejected": -5.214597225189209,
"step": 210
},
{
"epoch": 0.53,
"grad_norm": 4906.2448320907815,
"learning_rate": 2.1269332013798747e-07,
"logits/chosen": -2.7431142330169678,
"logits/rejected": -2.7241249084472656,
"logps/chosen": -0.8835189938545227,
"logps/rejected": -0.8670462369918823,
"loss": 4.3795,
"rewards/accuracies": 0.78125,
"rewards/chosen": 4.582579135894775,
"rewards/margins": 8.150335311889648,
"rewards/rejected": -3.567755937576294,
"step": 220
},
{
"epoch": 0.56,
"grad_norm": 2995.5119741253625,
"learning_rate": 1.9576636387676436e-07,
"logits/chosen": -2.690732955932617,
"logits/rejected": -2.653067111968994,
"logps/chosen": -0.960831344127655,
"logps/rejected": -0.9556485414505005,
"loss": 4.0487,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -8.449748039245605,
"rewards/margins": 10.095115661621094,
"rewards/rejected": -18.544864654541016,
"step": 230
},
{
"epoch": 0.58,
"grad_norm": 3504.414329050279,
"learning_rate": 1.7886974694151976e-07,
"logits/chosen": -2.7119805812835693,
"logits/rejected": -2.6879172325134277,
"logps/chosen": -0.990290641784668,
"logps/rejected": -0.9934972524642944,
"loss": 4.3644,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.5965616106987,
"rewards/margins": 9.211602210998535,
"rewards/rejected": -9.808164596557617,
"step": 240
},
{
"epoch": 0.6,
"grad_norm": 3400.5848210057216,
"learning_rate": 1.6212455483752895e-07,
"logits/chosen": -2.756906270980835,
"logits/rejected": -2.6796135902404785,
"logps/chosen": -0.8838168978691101,
"logps/rejected": -0.9137406349182129,
"loss": 4.5034,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 6.5281982421875,
"rewards/margins": 8.64702033996582,
"rewards/rejected": -2.1188230514526367,
"step": 250
},
{
"epoch": 0.63,
"grad_norm": 6194.117841583386,
"learning_rate": 1.4565078792075733e-07,
"logits/chosen": -2.7132773399353027,
"logits/rejected": -2.6494650840759277,
"logps/chosen": -1.002362847328186,
"logps/rejected": -0.9982520341873169,
"loss": 4.8134,
"rewards/accuracies": 0.84375,
"rewards/chosen": 3.0224878787994385,
"rewards/margins": 16.206506729125977,
"rewards/rejected": -13.1840181350708,
"step": 260
},
{
"epoch": 0.65,
"grad_norm": 4565.495892627232,
"learning_rate": 1.295665014444281e-07,
"logits/chosen": -2.7381529808044434,
"logits/rejected": -2.6608738899230957,
"logps/chosen": -0.9501218795776367,
"logps/rejected": -0.9476363062858582,
"loss": 5.3754,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.570526123046875,
"rewards/margins": 12.367398262023926,
"rewards/rejected": -12.9379243850708,
"step": 270
},
{
"epoch": 0.68,
"grad_norm": 5337.153187944306,
"learning_rate": 1.1398695954469597e-07,
"logits/chosen": -2.6872425079345703,
"logits/rejected": -2.630267381668091,
"logps/chosen": -0.9056104421615601,
"logps/rejected": -0.8939152956008911,
"loss": 4.1053,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 4.294297218322754,
"rewards/margins": 7.472552299499512,
"rewards/rejected": -3.1782548427581787,
"step": 280
},
{
"epoch": 0.7,
"grad_norm": 3582.07962645892,
"learning_rate": 9.902380922818425e-08,
"logits/chosen": -2.7334370613098145,
"logits/rejected": -2.6919913291931152,
"logps/chosen": -0.9840775728225708,
"logps/rejected": -0.9756690263748169,
"loss": 3.2759,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 8.966680526733398,
"rewards/margins": 11.496904373168945,
"rewards/rejected": -2.5302233695983887,
"step": 290
},
{
"epoch": 0.73,
"grad_norm": 4767.591882910886,
"learning_rate": 8.478428028080398e-08,
"logits/chosen": -2.7305169105529785,
"logits/rejected": -2.6773815155029297,
"logps/chosen": -0.8988749384880066,
"logps/rejected": -0.9437707662582397,
"loss": 4.3175,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.8898951411247253,
"rewards/margins": 8.447718620300293,
"rewards/rejected": -7.55782413482666,
"step": 300
},
{
"epoch": 0.75,
"grad_norm": 4819.380329592898,
"learning_rate": 7.137041683151202e-08,
"logits/chosen": -2.7228643894195557,
"logits/rejected": -2.6581058502197266,
"logps/chosen": -1.0781683921813965,
"logps/rejected": -1.028840184211731,
"loss": 2.9744,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.4226202964782715,
"rewards/margins": 13.473236083984375,
"rewards/rejected": -15.895855903625488,
"step": 310
},
{
"epoch": 0.77,
"grad_norm": 7840.551721640683,
"learning_rate": 5.8878346077822135e-08,
"logits/chosen": -2.7280871868133545,
"logits/rejected": -2.649958848953247,
"logps/chosen": -0.9020591974258423,
"logps/rejected": -0.9361578822135925,
"loss": 2.7082,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 1.223115086555481,
"rewards/margins": 10.582406997680664,
"rewards/rejected": -9.35929012298584,
"step": 320
},
{
"epoch": 0.8,
"grad_norm": 4662.77535052248,
"learning_rate": 4.73975894135696e-08,
"logits/chosen": -2.6770853996276855,
"logits/rejected": -2.6099040508270264,
"logps/chosen": -0.9263202548027039,
"logps/rejected": -0.9608638882637024,
"loss": 3.1985,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.2365754395723343,
"rewards/margins": 13.195585250854492,
"rewards/rejected": -12.959010124206543,
"step": 330
},
{
"epoch": 0.82,
"grad_norm": 4550.588002339864,
"learning_rate": 3.701042089556483e-08,
"logits/chosen": -2.756493330001831,
"logits/rejected": -2.687851667404175,
"logps/chosen": -0.8901381492614746,
"logps/rejected": -0.9301478266716003,
"loss": 3.841,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.11963929980993271,
"rewards/margins": 7.3289618492126465,
"rewards/rejected": -7.209322929382324,
"step": 340
},
{
"epoch": 0.85,
"grad_norm": 5464.471487236709,
"learning_rate": 2.779127764652889e-08,
"logits/chosen": -2.689107656478882,
"logits/rejected": -2.6330015659332275,
"logps/chosen": -0.9756801724433899,
"logps/rejected": -0.9646003842353821,
"loss": 3.6421,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.1974527835845947,
"rewards/margins": 9.013090133666992,
"rewards/rejected": -10.210542678833008,
"step": 350
},
{
"epoch": 0.87,
"grad_norm": 5949.708940984834,
"learning_rate": 1.9806226419516193e-08,
"logits/chosen": -2.704460620880127,
"logits/rejected": -2.656071186065674,
"logps/chosen": -0.9623576402664185,
"logps/rejected": -1.0082406997680664,
"loss": 3.5231,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 2.5273587703704834,
"rewards/margins": 11.88086223602295,
"rewards/rejected": -9.35350227355957,
"step": 360
},
{
"epoch": 0.89,
"grad_norm": 4320.933402478669,
"learning_rate": 1.3112490146559552e-08,
"logits/chosen": -2.7451281547546387,
"logits/rejected": -2.686728000640869,
"logps/chosen": -0.8951610326766968,
"logps/rejected": -0.89850914478302,
"loss": 3.0053,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.6753175258636475,
"rewards/margins": 12.29626750946045,
"rewards/rejected": -11.620949745178223,
"step": 370
},
{
"epoch": 0.92,
"grad_norm": 2514.940389992379,
"learning_rate": 7.758037864413247e-09,
"logits/chosen": -2.7158432006835938,
"logits/rejected": -2.6906635761260986,
"logps/chosen": -0.9033122062683105,
"logps/rejected": -0.9709407091140747,
"loss": 2.8751,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.9628832936286926,
"rewards/margins": 10.83133316040039,
"rewards/rejected": -9.868449211120605,
"step": 380
},
{
"epoch": 0.94,
"grad_norm": 3504.225752431698,
"learning_rate": 3.78124095609087e-09,
"logits/chosen": -2.6947999000549316,
"logits/rejected": -2.6553878784179688,
"logps/chosen": -0.9263744354248047,
"logps/rejected": -0.9935058355331421,
"loss": 3.019,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 1.4072116613388062,
"rewards/margins": 10.741894721984863,
"rewards/rejected": -9.334683418273926,
"step": 390
},
{
"epoch": 0.97,
"grad_norm": 4987.634749508018,
"learning_rate": 1.2105981716597603e-09,
"logits/chosen": -2.7300946712493896,
"logits/rejected": -2.6389007568359375,
"logps/chosen": -0.9686774015426636,
"logps/rejected": -0.9328421354293823,
"loss": 3.7864,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": 0.6465551853179932,
"rewards/margins": 11.809611320495605,
"rewards/rejected": -11.163057327270508,
"step": 400
},
{
"epoch": 0.99,
"grad_norm": 5473.226219590305,
"learning_rate": 6.453139886395398e-11,
"logits/chosen": -2.7284317016601562,
"logits/rejected": -2.6886637210845947,
"logps/chosen": -0.9334842562675476,
"logps/rejected": -0.9600637555122375,
"loss": 3.6391,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.157397747039795,
"rewards/margins": 13.2835054397583,
"rewards/rejected": -15.440902709960938,
"step": 410
},
{
"epoch": 1.0,
"step": 413,
"total_flos": 0.0,
"train_loss": 3.8421780889894426,
"train_runtime": 6381.4933,
"train_samples_per_second": 8.293,
"train_steps_per_second": 0.065
}
],
"logging_steps": 10,
"max_steps": 413,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
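
The file above is standard Hugging Face Trainer state output: per-step DPO metrics live under "log_history", and the final entry holds run-level summaries (train_loss, train_runtime) instead of step metrics. Below is a minimal sketch of how one might plot the loss and reward-margin curves from it; the local file name, the use of matplotlib, and the output image name are assumptions for illustration, not part of the original log.

# Minimal sketch: load trainer_state.json and plot DPO loss and reward margins per step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only records that actually carry per-step metrics; the last entry is a
# run summary and has no "loss" or "rewards/margins" keys.
records = [r for r in state["log_history"] if "rewards/margins" in r]

steps = [r["step"] for r in records]
loss = [r["loss"] for r in records]
margins = [r["rewards/margins"] for r in records]

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
ax1.plot(steps, loss)
ax1.set_ylabel("DPO loss")
ax2.plot(steps, margins)
ax2.set_ylabel("rewards/margins")
ax2.set_xlabel("step")
fig.tight_layout()
fig.savefig("trainer_state_curves.png")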