zephyr-7b-dpo-full / trainer_state.json
RikkiXu's picture
Model save
48d9805 verified
raw
history blame
No virus
21.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9980806142034548,
"eval_steps": 10000000,
"global_step": 390,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1135.2510024076782,
"learning_rate": 1.282051282051282e-08,
"logits/chosen": -2.5583817958831787,
"logits/rejected": -2.4487552642822266,
"logps/chosen": -258.1644592285156,
"logps/rejected": -216.25729370117188,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.03,
"grad_norm": 1064.195577422658,
"learning_rate": 1.2820512820512818e-07,
"logits/chosen": -2.606004476547241,
"logits/rejected": -2.553109884262085,
"logps/chosen": -267.5234680175781,
"logps/rejected": -217.6415557861328,
"loss": 0.7054,
"rewards/accuracies": 0.4444444477558136,
"rewards/chosen": 0.03280753642320633,
"rewards/margins": 0.0353083573281765,
"rewards/rejected": -0.002500815549865365,
"step": 10
},
{
"epoch": 0.05,
"grad_norm": 736.2634036624544,
"learning_rate": 2.5641025641025636e-07,
"logits/chosen": -2.630505323410034,
"logits/rejected": -2.5676522254943848,
"logps/chosen": -260.584716796875,
"logps/rejected": -207.07144165039062,
"loss": 0.5213,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.5755742788314819,
"rewards/margins": 0.5894275903701782,
"rewards/rejected": -0.013853324577212334,
"step": 20
},
{
"epoch": 0.08,
"grad_norm": 1076.3695793406284,
"learning_rate": 3.8461538461538463e-07,
"logits/chosen": -2.6462178230285645,
"logits/rejected": -2.571561336517334,
"logps/chosen": -250.9139862060547,
"logps/rejected": -198.4534912109375,
"loss": 0.3324,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 3.3866074085235596,
"rewards/margins": 3.0545947551727295,
"rewards/rejected": 0.3320125639438629,
"step": 30
},
{
"epoch": 0.1,
"grad_norm": 418.3228099023361,
"learning_rate": 4.99989986344963e-07,
"logits/chosen": -2.6392903327941895,
"logits/rejected": -2.5602712631225586,
"logps/chosen": -243.54013061523438,
"logps/rejected": -192.9114227294922,
"loss": 0.3161,
"rewards/accuracies": 0.84375,
"rewards/chosen": 5.447351455688477,
"rewards/margins": 4.827452182769775,
"rewards/rejected": 0.6198989748954773,
"step": 40
},
{
"epoch": 0.13,
"grad_norm": 630.2703390024756,
"learning_rate": 4.987893180827479e-07,
"logits/chosen": -2.651214361190796,
"logits/rejected": -2.57964825630188,
"logps/chosen": -258.42962646484375,
"logps/rejected": -203.57992553710938,
"loss": 0.366,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 7.846033573150635,
"rewards/margins": 6.590806007385254,
"rewards/rejected": 1.255226731300354,
"step": 50
},
{
"epoch": 0.15,
"grad_norm": 655.8352889546771,
"learning_rate": 4.955969343539162e-07,
"logits/chosen": -2.60957932472229,
"logits/rejected": -2.5362067222595215,
"logps/chosen": -262.3640441894531,
"logps/rejected": -209.32199096679688,
"loss": 0.3453,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 5.170942306518555,
"rewards/margins": 6.18172550201416,
"rewards/rejected": -1.0107834339141846,
"step": 60
},
{
"epoch": 0.18,
"grad_norm": 456.9589116841801,
"learning_rate": 4.90438392204474e-07,
"logits/chosen": -2.5825228691101074,
"logits/rejected": -2.5089833736419678,
"logps/chosen": -291.7918395996094,
"logps/rejected": -227.83432006835938,
"loss": 0.3454,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 4.895013809204102,
"rewards/margins": 7.00995397567749,
"rewards/rejected": -2.1149401664733887,
"step": 70
},
{
"epoch": 0.2,
"grad_norm": 816.8720109326792,
"learning_rate": 4.83354989019146e-07,
"logits/chosen": -2.5420753955841064,
"logits/rejected": -2.467258930206299,
"logps/chosen": -259.6270446777344,
"logps/rejected": -204.15179443359375,
"loss": 0.3311,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 6.344871997833252,
"rewards/margins": 7.2052764892578125,
"rewards/rejected": -0.860403835773468,
"step": 80
},
{
"epoch": 0.23,
"grad_norm": 922.6738539012168,
"learning_rate": 4.7440343190975353e-07,
"logits/chosen": -2.5713560581207275,
"logits/rejected": -2.513441801071167,
"logps/chosen": -257.0751037597656,
"logps/rejected": -217.1184844970703,
"loss": 0.3343,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 3.73614239692688,
"rewards/margins": 5.834546089172363,
"rewards/rejected": -2.0984034538269043,
"step": 90
},
{
"epoch": 0.26,
"grad_norm": 406.82707972381877,
"learning_rate": 4.6365538373900506e-07,
"logits/chosen": -2.6249356269836426,
"logits/rejected": -2.5500850677490234,
"logps/chosen": -236.4239501953125,
"logps/rejected": -200.73150634765625,
"loss": 0.5974,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 4.595959663391113,
"rewards/margins": 6.244544506072998,
"rewards/rejected": -1.648585557937622,
"step": 100
},
{
"epoch": 0.28,
"grad_norm": 656.3071663391811,
"learning_rate": 4.5119688941406386e-07,
"logits/chosen": -2.618974208831787,
"logits/rejected": -2.5380780696868896,
"logps/chosen": -257.79248046875,
"logps/rejected": -209.8715362548828,
"loss": 0.4404,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 6.305555820465088,
"rewards/margins": 7.463587760925293,
"rewards/rejected": -1.158031940460205,
"step": 110
},
{
"epoch": 0.31,
"grad_norm": 810.7648282749318,
"learning_rate": 4.3712768704277524e-07,
"logits/chosen": -2.5895957946777344,
"logits/rejected": -2.519530773162842,
"logps/chosen": -262.7950134277344,
"logps/rejected": -208.9604949951172,
"loss": 0.438,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 5.333884239196777,
"rewards/margins": 7.409175872802734,
"rewards/rejected": -2.075291156768799,
"step": 120
},
{
"epoch": 0.33,
"grad_norm": 513.4959841183485,
"learning_rate": 4.2156040946718343e-07,
"logits/chosen": -2.5553436279296875,
"logits/rejected": -2.487457752227783,
"logps/chosen": -251.7507781982422,
"logps/rejected": -197.44088745117188,
"loss": 0.4027,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 4.441976070404053,
"rewards/margins": 7.408116340637207,
"rewards/rejected": -2.966140031814575,
"step": 130
},
{
"epoch": 0.36,
"grad_norm": 650.5511601275197,
"learning_rate": 4.046196825665637e-07,
"logits/chosen": -2.5706536769866943,
"logits/rejected": -2.500262498855591,
"logps/chosen": -270.2043762207031,
"logps/rejected": -217.0515594482422,
"loss": 0.4293,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 3.977551221847534,
"rewards/margins": 6.7731499671936035,
"rewards/rejected": -2.7955987453460693,
"step": 140
},
{
"epoch": 0.38,
"grad_norm": 530.5799871161138,
"learning_rate": 3.864411275486261e-07,
"logits/chosen": -2.5574281215667725,
"logits/rejected": -2.488007068634033,
"logps/chosen": -263.3489685058594,
"logps/rejected": -212.54638671875,
"loss": 0.4583,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 5.79421329498291,
"rewards/margins": 7.515044212341309,
"rewards/rejected": -1.720831274986267,
"step": 150
},
{
"epoch": 0.41,
"grad_norm": 600.6086946072276,
"learning_rate": 3.671702752161759e-07,
"logits/chosen": -2.563870906829834,
"logits/rejected": -2.493649482727051,
"logps/chosen": -244.5281219482422,
"logps/rejected": -198.3011474609375,
"loss": 0.4465,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 3.527863025665283,
"rewards/margins": 7.751578330993652,
"rewards/rejected": -4.223715782165527,
"step": 160
},
{
"epoch": 0.44,
"grad_norm": 753.6856997505446,
"learning_rate": 3.4696140090121375e-07,
"logits/chosen": -2.5673775672912598,
"logits/rejected": -2.500842571258545,
"logps/chosen": -265.5797119140625,
"logps/rejected": -211.0306854248047,
"loss": 0.3547,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 4.089644908905029,
"rewards/margins": 7.812180519104004,
"rewards/rejected": -3.7225348949432373,
"step": 170
},
{
"epoch": 0.46,
"grad_norm": 645.3967547220625,
"learning_rate": 3.259762893935617e-07,
"logits/chosen": -2.6238903999328613,
"logits/rejected": -2.534097194671631,
"logps/chosen": -236.9849395751953,
"logps/rejected": -186.6522674560547,
"loss": 0.4499,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 4.038764953613281,
"rewards/margins": 6.760235786437988,
"rewards/rejected": -2.721470594406128,
"step": 180
},
{
"epoch": 0.49,
"grad_norm": 707.705744532387,
"learning_rate": 3.0438293975154184e-07,
"logits/chosen": -2.582486867904663,
"logits/rejected": -2.5034093856811523,
"logps/chosen": -261.0556945800781,
"logps/rejected": -205.6962890625,
"loss": 0.3591,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 3.011924982070923,
"rewards/margins": 8.104998588562012,
"rewards/rejected": -5.093073844909668,
"step": 190
},
{
"epoch": 0.51,
"grad_norm": 822.8629977119059,
"learning_rate": 2.823542203635138e-07,
"logits/chosen": -2.615396499633789,
"logits/rejected": -2.5223731994628906,
"logps/chosen": -277.3884582519531,
"logps/rejected": -221.803466796875,
"loss": 0.4468,
"rewards/accuracies": 0.875,
"rewards/chosen": 2.5077309608459473,
"rewards/margins": 9.055838584899902,
"rewards/rejected": -6.548108100891113,
"step": 200
},
{
"epoch": 0.54,
"grad_norm": 954.7034527431528,
"learning_rate": 2.600664850273538e-07,
"logits/chosen": -2.603569269180298,
"logits/rejected": -2.5283331871032715,
"logps/chosen": -269.19873046875,
"logps/rejected": -213.823974609375,
"loss": 0.6013,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 1.4840681552886963,
"rewards/margins": 7.454611778259277,
"rewards/rejected": -5.97054386138916,
"step": 210
},
{
"epoch": 0.56,
"grad_norm": 685.0955562473252,
"learning_rate": 2.3769816112703045e-07,
"logits/chosen": -2.6224589347839355,
"logits/rejected": -2.55679988861084,
"logps/chosen": -257.71661376953125,
"logps/rejected": -214.28329467773438,
"loss": 0.4806,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 3.168187141418457,
"rewards/margins": 6.781345367431641,
"rewards/rejected": -3.6131577491760254,
"step": 220
},
{
"epoch": 0.59,
"grad_norm": 568.8894162951807,
"learning_rate": 2.1542832120881677e-07,
"logits/chosen": -2.664320945739746,
"logits/rejected": -2.5764544010162354,
"logps/chosen": -266.98114013671875,
"logps/rejected": -216.44894409179688,
"loss": 0.4149,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 5.137583255767822,
"rewards/margins": 7.965329647064209,
"rewards/rejected": -2.827746629714966,
"step": 230
},
{
"epoch": 0.61,
"grad_norm": 923.147651672606,
"learning_rate": 1.934352493925695e-07,
"logits/chosen": -2.6468780040740967,
"logits/rejected": -2.5980067253112793,
"logps/chosen": -262.94610595703125,
"logps/rejected": -220.69448852539062,
"loss": 0.3991,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 3.828115463256836,
"rewards/margins": 9.526643753051758,
"rewards/rejected": -5.69852876663208,
"step": 240
},
{
"epoch": 0.64,
"grad_norm": 615.4120078013015,
"learning_rate": 1.7189501409486059e-07,
"logits/chosen": -2.656362533569336,
"logits/rejected": -2.584864616394043,
"logps/chosen": -267.7325439453125,
"logps/rejected": -222.2632293701172,
"loss": 0.4004,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 3.2369320392608643,
"rewards/margins": 7.942319393157959,
"rewards/rejected": -4.705387115478516,
"step": 250
},
{
"epoch": 0.67,
"grad_norm": 488.0068782741624,
"learning_rate": 1.5098005849021078e-07,
"logits/chosen": -2.64605450630188,
"logits/rejected": -2.586585283279419,
"logps/chosen": -261.89093017578125,
"logps/rejected": -208.77493286132812,
"loss": 0.3817,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 3.175231456756592,
"rewards/margins": 7.630448818206787,
"rewards/rejected": -4.455216884613037,
"step": 260
},
{
"epoch": 0.69,
"grad_norm": 559.4430135222711,
"learning_rate": 1.30857819994673e-07,
"logits/chosen": -2.6208698749542236,
"logits/rejected": -2.5371921062469482,
"logps/chosen": -274.78753662109375,
"logps/rejected": -230.4307861328125,
"loss": 0.5355,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 1.437089204788208,
"rewards/margins": 9.265036582946777,
"rewards/rejected": -7.82794713973999,
"step": 270
},
{
"epoch": 0.72,
"grad_norm": 432.8210354095987,
"learning_rate": 1.116893898236716e-07,
"logits/chosen": -2.654949426651001,
"logits/rejected": -2.5985524654388428,
"logps/chosen": -270.3836975097656,
"logps/rejected": -219.8002471923828,
"loss": 0.3718,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 2.6581013202667236,
"rewards/margins": 8.142509460449219,
"rewards/rejected": -5.484408378601074,
"step": 280
},
{
"epoch": 0.74,
"grad_norm": 482.2442984028295,
"learning_rate": 9.362822335518062e-08,
"logits/chosen": -2.6166903972625732,
"logits/rejected": -2.5696167945861816,
"logps/chosen": -268.19140625,
"logps/rejected": -216.9479522705078,
"loss": 0.3568,
"rewards/accuracies": 0.875,
"rewards/chosen": 3.0037200450897217,
"rewards/margins": 7.667593479156494,
"rewards/rejected": -4.663873195648193,
"step": 290
},
{
"epoch": 0.77,
"grad_norm": 492.9163861530474,
"learning_rate": 7.681891162260015e-08,
"logits/chosen": -2.636460781097412,
"logits/rejected": -2.580770254135132,
"logps/chosen": -274.6198425292969,
"logps/rejected": -220.8531951904297,
"loss": 0.3983,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 3.163914680480957,
"rewards/margins": 7.829231262207031,
"rewards/rejected": -4.665315628051758,
"step": 300
},
{
"epoch": 0.79,
"grad_norm": 437.9917779014462,
"learning_rate": 6.139602377230247e-08,
"logits/chosen": -2.6010611057281494,
"logits/rejected": -2.532543897628784,
"logps/chosen": -278.3953552246094,
"logps/rejected": -215.9014129638672,
"loss": 0.4376,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 3.1028757095336914,
"rewards/margins": 8.152434349060059,
"rewards/rejected": -5.049559593200684,
"step": 310
},
{
"epoch": 0.82,
"grad_norm": 649.8222699481745,
"learning_rate": 4.748302975270837e-08,
"logits/chosen": -2.6264309883117676,
"logits/rejected": -2.5793588161468506,
"logps/chosen": -261.37890625,
"logps/rejected": -204.51773071289062,
"loss": 0.405,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 2.8262996673583984,
"rewards/margins": 7.375731468200684,
"rewards/rejected": -4.549432277679443,
"step": 320
},
{
"epoch": 0.84,
"grad_norm": 583.9617574483902,
"learning_rate": 3.5191311859445795e-08,
"logits/chosen": -2.6449975967407227,
"logits/rejected": -2.586719512939453,
"logps/chosen": -264.58428955078125,
"logps/rejected": -217.4517364501953,
"loss": 0.3924,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 4.091521263122559,
"rewards/margins": 7.869417667388916,
"rewards/rejected": -3.7778968811035156,
"step": 330
},
{
"epoch": 0.87,
"grad_norm": 516.763098966226,
"learning_rate": 2.4619273049795996e-08,
"logits/chosen": -2.631946563720703,
"logits/rejected": -2.5740180015563965,
"logps/chosen": -260.0722961425781,
"logps/rejected": -210.775146484375,
"loss": 0.3558,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 3.791111469268799,
"rewards/margins": 8.514566421508789,
"rewards/rejected": -4.723455905914307,
"step": 340
},
{
"epoch": 0.9,
"grad_norm": 434.316228593937,
"learning_rate": 1.5851549164932115e-08,
"logits/chosen": -2.641859531402588,
"logits/rejected": -2.592379093170166,
"logps/chosen": -269.5948181152344,
"logps/rejected": -226.536865234375,
"loss": 0.382,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 3.7248435020446777,
"rewards/margins": 7.7656402587890625,
"rewards/rejected": -4.040797233581543,
"step": 350
},
{
"epoch": 0.92,
"grad_norm": 570.6334718025578,
"learning_rate": 8.958331366609423e-09,
"logits/chosen": -2.6432430744171143,
"logits/rejected": -2.574936628341675,
"logps/chosen": -275.0256652832031,
"logps/rejected": -219.6584014892578,
"loss": 0.4253,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 3.3530006408691406,
"rewards/margins": 8.096589088439941,
"rewards/rejected": -4.743588447570801,
"step": 360
},
{
"epoch": 0.95,
"grad_norm": 877.4134874498682,
"learning_rate": 3.994804212627461e-09,
"logits/chosen": -2.6024394035339355,
"logits/rejected": -2.5662083625793457,
"logps/chosen": -273.9478454589844,
"logps/rejected": -229.1957550048828,
"loss": 0.4977,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 4.258389472961426,
"rewards/margins": 7.956662178039551,
"rewards/rejected": -3.698272705078125,
"step": 370
},
{
"epoch": 0.97,
"grad_norm": 416.60583937652194,
"learning_rate": 1.0007038696262516e-09,
"logits/chosen": -2.651128053665161,
"logits/rejected": -2.610159397125244,
"logps/chosen": -263.07269287109375,
"logps/rejected": -230.61502075195312,
"loss": 0.3902,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 4.250136375427246,
"rewards/margins": 8.099352836608887,
"rewards/rejected": -3.8492164611816406,
"step": 380
},
{
"epoch": 1.0,
"grad_norm": 678.8175373396961,
"learning_rate": 0.0,
"logits/chosen": -2.6594204902648926,
"logits/rejected": -2.5979819297790527,
"logps/chosen": -250.8957977294922,
"logps/rejected": -210.31497192382812,
"loss": 0.4132,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 3.278926134109497,
"rewards/margins": 7.683538913726807,
"rewards/rejected": -4.4046125411987305,
"step": 390
},
{
"epoch": 1.0,
"step": 390,
"total_flos": 0.0,
"train_loss": 0.4220164916454217,
"train_runtime": 5868.9984,
"train_samples_per_second": 8.519,
"train_steps_per_second": 0.066
}
],
"logging_steps": 10,
"max_steps": 390,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}