zephyr-7b-dpo-full / trainer_state.json
RikkiXu's picture
Model save
b3f71f2 verified
raw
history blame
41.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9986824769433466,
"eval_steps": 100,
"global_step": 379,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 32.10821322863283,
"learning_rate": 1.3157894736842104e-08,
"logits/chosen": -2.219799041748047,
"logits/rejected": -2.229109525680542,
"logps/chosen": -269.856201171875,
"logps/rejected": -192.3697509765625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 35.006735982686784,
"learning_rate": 6.578947368421052e-08,
"logits/chosen": -2.276287078857422,
"logits/rejected": -2.2080204486846924,
"logps/chosen": -283.5397644042969,
"logps/rejected": -208.59442138671875,
"loss": 0.6931,
"rewards/accuracies": 0.40625,
"rewards/chosen": 0.0013411559630185366,
"rewards/margins": 0.00017823810048867017,
"rewards/rejected": 0.0011629178188741207,
"step": 5
},
{
"epoch": 0.03,
"grad_norm": 36.439618208906154,
"learning_rate": 1.3157894736842104e-07,
"logits/chosen": -2.3274893760681152,
"logits/rejected": -2.2322466373443604,
"logps/chosen": -286.6865539550781,
"logps/rejected": -206.5416717529297,
"loss": 0.6926,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.00261278566904366,
"rewards/margins": -3.606556128943339e-05,
"rewards/rejected": 0.0026488511357456446,
"step": 10
},
{
"epoch": 0.04,
"grad_norm": 34.873528802219326,
"learning_rate": 1.9736842105263157e-07,
"logits/chosen": -2.3293230533599854,
"logits/rejected": -2.2996506690979004,
"logps/chosen": -289.71771240234375,
"logps/rejected": -237.6280059814453,
"loss": 0.6906,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.017419874668121338,
"rewards/margins": 0.004653422627598047,
"rewards/rejected": 0.012766450643539429,
"step": 15
},
{
"epoch": 0.05,
"grad_norm": 31.898185632963774,
"learning_rate": 2.631578947368421e-07,
"logits/chosen": -2.3196263313293457,
"logits/rejected": -2.2592759132385254,
"logps/chosen": -263.24908447265625,
"logps/rejected": -202.05458068847656,
"loss": 0.6857,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.05536944791674614,
"rewards/margins": 0.01851554773747921,
"rewards/rejected": 0.03685389831662178,
"step": 20
},
{
"epoch": 0.07,
"grad_norm": 28.076450357377052,
"learning_rate": 3.2894736842105264e-07,
"logits/chosen": -2.3921802043914795,
"logits/rejected": -2.377410650253296,
"logps/chosen": -260.9800720214844,
"logps/rejected": -214.38821411132812,
"loss": 0.6763,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.13004128634929657,
"rewards/margins": 0.03864985704421997,
"rewards/rejected": 0.0913914293050766,
"step": 25
},
{
"epoch": 0.08,
"grad_norm": 25.341140024963043,
"learning_rate": 3.9473684210526315e-07,
"logits/chosen": -2.477496385574341,
"logits/rejected": -2.506354808807373,
"logps/chosen": -255.5835723876953,
"logps/rejected": -223.4532470703125,
"loss": 0.6664,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.2109464406967163,
"rewards/margins": 0.04907592386007309,
"rewards/rejected": 0.16187050938606262,
"step": 30
},
{
"epoch": 0.09,
"grad_norm": 23.40327238126855,
"learning_rate": 4.6052631578947365e-07,
"logits/chosen": -2.6838037967681885,
"logits/rejected": -2.6064510345458984,
"logps/chosen": -272.02825927734375,
"logps/rejected": -207.57565307617188,
"loss": 0.6576,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.3294451832771301,
"rewards/margins": 0.12016526609659195,
"rewards/rejected": 0.20927992463111877,
"step": 35
},
{
"epoch": 0.11,
"grad_norm": 21.272616898577162,
"learning_rate": 4.999575626062319e-07,
"logits/chosen": -2.747399091720581,
"logits/rejected": -2.7062249183654785,
"logps/chosen": -249.28085327148438,
"logps/rejected": -202.8720245361328,
"loss": 0.6504,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.3287069499492645,
"rewards/margins": 0.0975252240896225,
"rewards/rejected": 0.23118171095848083,
"step": 40
},
{
"epoch": 0.12,
"grad_norm": 18.3841620762551,
"learning_rate": 4.994803073715569e-07,
"logits/chosen": -2.788217067718506,
"logits/rejected": -2.736720561981201,
"logps/chosen": -260.8193359375,
"logps/rejected": -202.5912322998047,
"loss": 0.6444,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.4205988049507141,
"rewards/margins": 0.14394915103912354,
"rewards/rejected": 0.2766496539115906,
"step": 45
},
{
"epoch": 0.13,
"grad_norm": 18.656759383093647,
"learning_rate": 4.984737660598186e-07,
"logits/chosen": -2.8456408977508545,
"logits/rejected": -2.7947795391082764,
"logps/chosen": -223.0256805419922,
"logps/rejected": -197.88247680664062,
"loss": 0.6469,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.3571344316005707,
"rewards/margins": 0.09834496676921844,
"rewards/rejected": 0.2587894797325134,
"step": 50
},
{
"epoch": 0.14,
"grad_norm": 20.93264303618464,
"learning_rate": 4.969400741032999e-07,
"logits/chosen": -2.8368711471557617,
"logits/rejected": -2.8192451000213623,
"logps/chosen": -234.53515625,
"logps/rejected": -199.7665252685547,
"loss": 0.6341,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.33271390199661255,
"rewards/margins": 0.14281830191612244,
"rewards/rejected": 0.18989557027816772,
"step": 55
},
{
"epoch": 0.16,
"grad_norm": 19.192901766300064,
"learning_rate": 4.948824853131236e-07,
"logits/chosen": -2.8852925300598145,
"logits/rejected": -2.859867572784424,
"logps/chosen": -253.8678741455078,
"logps/rejected": -212.1206512451172,
"loss": 0.6257,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.3522695302963257,
"rewards/margins": 0.1734958440065384,
"rewards/rejected": 0.1787737011909485,
"step": 60
},
{
"epoch": 0.17,
"grad_norm": 19.6177452377838,
"learning_rate": 4.923053649761152e-07,
"logits/chosen": -2.8169569969177246,
"logits/rejected": -2.7849762439727783,
"logps/chosen": -244.50808715820312,
"logps/rejected": -201.7557830810547,
"loss": 0.6149,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.35878387093544006,
"rewards/margins": 0.19476714730262756,
"rewards/rejected": 0.1640167087316513,
"step": 65
},
{
"epoch": 0.18,
"grad_norm": 20.424079652700506,
"learning_rate": 4.892141805936084e-07,
"logits/chosen": -2.789267063140869,
"logits/rejected": -2.73822283744812,
"logps/chosen": -250.8653564453125,
"logps/rejected": -224.6582794189453,
"loss": 0.6149,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.4033736288547516,
"rewards/margins": 0.23386640846729279,
"rewards/rejected": 0.1695072501897812,
"step": 70
},
{
"epoch": 0.2,
"grad_norm": 25.663818502097516,
"learning_rate": 4.856154902818431e-07,
"logits/chosen": -2.8126118183135986,
"logits/rejected": -2.7708253860473633,
"logps/chosen": -229.9053955078125,
"logps/rejected": -206.87258911132812,
"loss": 0.6004,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.2887989580631256,
"rewards/margins": 0.22907009720802307,
"rewards/rejected": 0.05972885340452194,
"step": 75
},
{
"epoch": 0.21,
"grad_norm": 24.00962454367728,
"learning_rate": 4.81516928858564e-07,
"logits/chosen": -2.7702012062072754,
"logits/rejected": -2.7202858924865723,
"logps/chosen": -271.06317138671875,
"logps/rejected": -220.15475463867188,
"loss": 0.5964,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.3578353524208069,
"rewards/margins": 0.31964898109436035,
"rewards/rejected": 0.03818630054593086,
"step": 80
},
{
"epoch": 0.22,
"grad_norm": 26.200013285180134,
"learning_rate": 4.769271916453385e-07,
"logits/chosen": -2.817336320877075,
"logits/rejected": -2.7950100898742676,
"logps/chosen": -257.9813537597656,
"logps/rejected": -234.77108764648438,
"loss": 0.5692,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.16794440150260925,
"rewards/margins": 0.3747493624687195,
"rewards/rejected": -0.20680496096611023,
"step": 85
},
{
"epoch": 0.24,
"grad_norm": 31.71282371862922,
"learning_rate": 4.7185601601995784e-07,
"logits/chosen": -2.8321759700775146,
"logits/rejected": -2.8417580127716064,
"logps/chosen": -252.80313110351562,
"logps/rejected": -233.3778839111328,
"loss": 0.5452,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.17818805575370789,
"rewards/margins": 0.4816998839378357,
"rewards/rejected": -0.3035118579864502,
"step": 90
},
{
"epoch": 0.25,
"grad_norm": 30.203747907953968,
"learning_rate": 4.6631416075805886e-07,
"logits/chosen": -2.9183051586151123,
"logits/rejected": -2.8942832946777344,
"logps/chosen": -292.6710510253906,
"logps/rejected": -278.3623962402344,
"loss": 0.5293,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.0014784678351134062,
"rewards/margins": 0.45240503549575806,
"rewards/rejected": -0.4509265422821045,
"step": 95
},
{
"epoch": 0.26,
"grad_norm": 35.20356531841964,
"learning_rate": 4.603133832077953e-07,
"logits/chosen": -3.0372838973999023,
"logits/rejected": -3.03391432762146,
"logps/chosen": -264.54412841796875,
"logps/rejected": -271.9236755371094,
"loss": 0.5292,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.24657364189624786,
"rewards/margins": 0.4146839678287506,
"rewards/rejected": -0.6612575650215149,
"step": 100
},
{
"epoch": 0.26,
"eval_logits/chosen": -2.981271743774414,
"eval_logits/rejected": -3.007460117340088,
"eval_logps/chosen": -500.8688049316406,
"eval_logps/rejected": -504.0709228515625,
"eval_loss": 0.6958277821540833,
"eval_rewards/accuracies": 0.53125,
"eval_rewards/chosen": -1.2149137258529663,
"eval_rewards/margins": 0.09968078136444092,
"eval_rewards/rejected": -1.3145945072174072,
"eval_runtime": 97.6692,
"eval_samples_per_second": 20.477,
"eval_steps_per_second": 0.328,
"step": 100
},
{
"epoch": 0.28,
"grad_norm": 35.37658079116714,
"learning_rate": 4.538664143459818e-07,
"logits/chosen": -3.1525657176971436,
"logits/rejected": -3.170300245285034,
"logps/chosen": -301.29736328125,
"logps/rejected": -306.07171630859375,
"loss": 0.4875,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.40715986490249634,
"rewards/margins": 0.6011335849761963,
"rewards/rejected": -1.0082933902740479,
"step": 105
},
{
"epoch": 0.29,
"grad_norm": 44.5542186040465,
"learning_rate": 4.4698693176863316e-07,
"logits/chosen": -3.1490285396575928,
"logits/rejected": -3.167726993560791,
"logps/chosen": -326.18792724609375,
"logps/rejected": -354.35784912109375,
"loss": 0.4645,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.6683047413825989,
"rewards/margins": 0.806756854057312,
"rewards/rejected": -1.4750616550445557,
"step": 110
},
{
"epoch": 0.3,
"grad_norm": 46.11054779372386,
"learning_rate": 4.396895306731977e-07,
"logits/chosen": -3.2922375202178955,
"logits/rejected": -3.3002562522888184,
"logps/chosen": -316.46331787109375,
"logps/rejected": -344.7227783203125,
"loss": 0.478,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.5892685651779175,
"rewards/margins": 0.8242443203926086,
"rewards/rejected": -1.413512945175171,
"step": 115
},
{
"epoch": 0.32,
"grad_norm": 39.587491646799435,
"learning_rate": 4.319896928940505e-07,
"logits/chosen": -3.4349207878112793,
"logits/rejected": -3.4556515216827393,
"logps/chosen": -430.2960510253906,
"logps/rejected": -467.32061767578125,
"loss": 0.4502,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.621132254600525,
"rewards/margins": 0.7838995456695557,
"rewards/rejected": -2.405031681060791,
"step": 120
},
{
"epoch": 0.33,
"grad_norm": 36.84462226750656,
"learning_rate": 4.2390375405693723e-07,
"logits/chosen": -3.474997043609619,
"logits/rejected": -3.5272536277770996,
"logps/chosen": -415.80438232421875,
"logps/rejected": -468.0596618652344,
"loss": 0.4422,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.5106348991394043,
"rewards/margins": 1.0294691324234009,
"rewards/rejected": -2.5401039123535156,
"step": 125
},
{
"epoch": 0.34,
"grad_norm": 47.09166107339635,
"learning_rate": 4.1544886892205354e-07,
"logits/chosen": -3.5425872802734375,
"logits/rejected": -3.5473670959472656,
"logps/chosen": -377.5634765625,
"logps/rejected": -425.98016357421875,
"loss": 0.4589,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.8434383273124695,
"rewards/margins": 0.9450035095214844,
"rewards/rejected": -1.7884416580200195,
"step": 130
},
{
"epoch": 0.36,
"grad_norm": 49.77750455648269,
"learning_rate": 4.0664297498928534e-07,
"logits/chosen": -3.6749653816223145,
"logits/rejected": -3.6941752433776855,
"logps/chosen": -392.3956604003906,
"logps/rejected": -454.9180603027344,
"loss": 0.4261,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5022857189178467,
"rewards/margins": 1.0955121517181396,
"rewards/rejected": -2.5977978706359863,
"step": 135
},
{
"epoch": 0.37,
"grad_norm": 49.72958773280421,
"learning_rate": 3.975047544428254e-07,
"logits/chosen": -3.7381629943847656,
"logits/rejected": -3.768315076828003,
"logps/chosen": -459.23455810546875,
"logps/rejected": -531.2666625976562,
"loss": 0.4262,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.8736200332641602,
"rewards/margins": 1.138641119003296,
"rewards/rejected": -3.012261152267456,
"step": 140
},
{
"epoch": 0.38,
"grad_norm": 37.39808184046548,
"learning_rate": 3.880535945158997e-07,
"logits/chosen": -3.7747676372528076,
"logits/rejected": -3.7997519969940186,
"logps/chosen": -367.59576416015625,
"logps/rejected": -449.857177734375,
"loss": 0.4127,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2651011943817139,
"rewards/margins": 1.1386528015136719,
"rewards/rejected": -2.403754234313965,
"step": 145
},
{
"epoch": 0.4,
"grad_norm": 47.294132664555036,
"learning_rate": 3.78309546359696e-07,
"logits/chosen": -3.9040164947509766,
"logits/rejected": -3.91438627243042,
"logps/chosen": -410.07159423828125,
"logps/rejected": -498.1920471191406,
"loss": 0.4235,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4813024997711182,
"rewards/margins": 1.1341646909713745,
"rewards/rejected": -2.6154673099517822,
"step": 150
},
{
"epoch": 0.41,
"grad_norm": 40.77943292899045,
"learning_rate": 3.6829328250375227e-07,
"logits/chosen": -3.932398557662964,
"logits/rejected": -4.020986080169678,
"logps/chosen": -427.6490783691406,
"logps/rejected": -508.22930908203125,
"loss": 0.3809,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.581094741821289,
"rewards/margins": 1.3298404216766357,
"rewards/rejected": -2.910935401916504,
"step": 155
},
{
"epoch": 0.42,
"grad_norm": 41.2026716936047,
"learning_rate": 3.580260529980584e-07,
"logits/chosen": -4.01393461227417,
"logits/rejected": -4.080017566680908,
"logps/chosen": -387.573974609375,
"logps/rejected": -487.1441345214844,
"loss": 0.3859,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2360130548477173,
"rewards/margins": 1.409085988998413,
"rewards/rejected": -2.64509916305542,
"step": 160
},
{
"epoch": 0.43,
"grad_norm": 40.85253910181408,
"learning_rate": 3.475296403299163e-07,
"logits/chosen": -4.092155456542969,
"logits/rejected": -4.100491523742676,
"logps/chosen": -388.4344482421875,
"logps/rejected": -494.84771728515625,
"loss": 0.391,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4383682012557983,
"rewards/margins": 1.5076179504394531,
"rewards/rejected": -2.945986032485962,
"step": 165
},
{
"epoch": 0.45,
"grad_norm": 43.630879623871635,
"learning_rate": 3.36826313211205e-07,
"logits/chosen": -4.23541784286499,
"logits/rejected": -4.378731727600098,
"logps/chosen": -423.3770446777344,
"logps/rejected": -521.7530517578125,
"loss": 0.3784,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7406959533691406,
"rewards/margins": 1.5629594326019287,
"rewards/rejected": -3.3036551475524902,
"step": 170
},
{
"epoch": 0.46,
"grad_norm": 40.910837008073884,
"learning_rate": 3.259387793340943e-07,
"logits/chosen": -4.390842437744141,
"logits/rejected": -4.466560363769531,
"logps/chosen": -482.41748046875,
"logps/rejected": -587.3478393554688,
"loss": 0.3845,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.141714096069336,
"rewards/margins": 1.4235177040100098,
"rewards/rejected": -3.5652313232421875,
"step": 175
},
{
"epoch": 0.47,
"grad_norm": 43.846997204513016,
"learning_rate": 3.14890137195437e-07,
"logits/chosen": -4.389448642730713,
"logits/rejected": -4.424112319946289,
"logps/chosen": -481.9625549316406,
"logps/rejected": -567.7194213867188,
"loss": 0.3682,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.1195263862609863,
"rewards/margins": 1.2820372581481934,
"rewards/rejected": -3.4015636444091797,
"step": 180
},
{
"epoch": 0.49,
"grad_norm": 48.89695189111101,
"learning_rate": 3.0370382709204883e-07,
"logits/chosen": -4.416136264801025,
"logits/rejected": -4.519248962402344,
"logps/chosen": -472.10931396484375,
"logps/rejected": -609.0546264648438,
"loss": 0.3879,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -1.9735286235809326,
"rewards/margins": 1.7880016565322876,
"rewards/rejected": -3.7615303993225098,
"step": 185
},
{
"epoch": 0.5,
"grad_norm": 49.97288424600573,
"learning_rate": 2.9240358139084013e-07,
"logits/chosen": -4.548556804656982,
"logits/rejected": -4.62185001373291,
"logps/chosen": -421.85052490234375,
"logps/rejected": -534.3924560546875,
"loss": 0.3824,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.734278917312622,
"rewards/margins": 1.5305709838867188,
"rewards/rejected": -3.264849901199341,
"step": 190
},
{
"epoch": 0.51,
"grad_norm": 42.794361716855846,
"learning_rate": 2.810133741793052e-07,
"logits/chosen": -4.521183490753174,
"logits/rejected": -4.65042781829834,
"logps/chosen": -472.19500732421875,
"logps/rejected": -594.0242309570312,
"loss": 0.3875,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.0321810245513916,
"rewards/margins": 1.7259247303009033,
"rewards/rejected": -3.758105516433716,
"step": 195
},
{
"epoch": 0.53,
"grad_norm": 43.28984360773978,
"learning_rate": 2.695573704031885e-07,
"logits/chosen": -4.522828102111816,
"logits/rejected": -4.58953332901001,
"logps/chosen": -470.6817321777344,
"logps/rejected": -592.047119140625,
"loss": 0.3733,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.8410589694976807,
"rewards/margins": 1.6147441864013672,
"rewards/rejected": -3.4558029174804688,
"step": 200
},
{
"epoch": 0.53,
"eval_logits/chosen": -4.407002925872803,
"eval_logits/rejected": -4.5463128089904785,
"eval_logps/chosen": -804.1498413085938,
"eval_logps/rejected": -826.000732421875,
"eval_loss": 0.9587702751159668,
"eval_rewards/accuracies": 0.52734375,
"eval_rewards/chosen": -4.247724533081055,
"eval_rewards/margins": 0.28616809844970703,
"eval_rewards/rejected": -4.53389310836792,
"eval_runtime": 97.528,
"eval_samples_per_second": 20.507,
"eval_steps_per_second": 0.328,
"step": 200
},
{
"epoch": 0.54,
"grad_norm": 45.90321806356986,
"learning_rate": 2.580598745992342e-07,
"logits/chosen": -4.592051029205322,
"logits/rejected": -4.729592800140381,
"logps/chosen": -462.078857421875,
"logps/rejected": -578.6028442382812,
"loss": 0.3683,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6925300359725952,
"rewards/margins": 1.7680349349975586,
"rewards/rejected": -3.4605648517608643,
"step": 205
},
{
"epoch": 0.55,
"grad_norm": 45.69561704557737,
"learning_rate": 2.465452793317865e-07,
"logits/chosen": -4.680180549621582,
"logits/rejected": -4.785167694091797,
"logps/chosen": -477.2386779785156,
"logps/rejected": -609.8983154296875,
"loss": 0.3534,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8637679815292358,
"rewards/margins": 1.7234246730804443,
"rewards/rejected": -3.587193012237549,
"step": 210
},
{
"epoch": 0.57,
"grad_norm": 50.566593421325194,
"learning_rate": 2.3503801344263344e-07,
"logits/chosen": -4.860222339630127,
"logits/rejected": -4.944591045379639,
"logps/chosen": -456.22357177734375,
"logps/rejected": -573.6387939453125,
"loss": 0.3712,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.1773338317871094,
"rewards/margins": 1.6265901327133179,
"rewards/rejected": -3.8039238452911377,
"step": 215
},
{
"epoch": 0.58,
"grad_norm": 49.14997910435826,
"learning_rate": 2.2356249022388789e-07,
"logits/chosen": -4.710982322692871,
"logits/rejected": -4.878017425537109,
"logps/chosen": -468.77978515625,
"logps/rejected": -580.1436157226562,
"loss": 0.3706,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.8084490299224854,
"rewards/margins": 1.884009599685669,
"rewards/rejected": -3.6924586296081543,
"step": 220
},
{
"epoch": 0.59,
"grad_norm": 48.71640334228914,
"learning_rate": 2.121430556238559e-07,
"logits/chosen": -4.8175554275512695,
"logits/rejected": -4.996689319610596,
"logps/chosen": -471.17132568359375,
"logps/rejected": -604.3876342773438,
"loss": 0.3386,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.008169412612915,
"rewards/margins": 1.8934684991836548,
"rewards/rejected": -3.9016380310058594,
"step": 225
},
{
"epoch": 0.61,
"grad_norm": 39.742825593785554,
"learning_rate": 2.0080393659578038e-07,
"logits/chosen": -4.740202903747559,
"logits/rejected": -5.004001617431641,
"logps/chosen": -513.0294189453125,
"logps/rejected": -656.8961181640625,
"loss": 0.3492,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.400604248046875,
"rewards/margins": 2.0355873107910156,
"rewards/rejected": -4.436191558837891,
"step": 230
},
{
"epoch": 0.62,
"grad_norm": 50.20217799621439,
"learning_rate": 1.895691896990388e-07,
"logits/chosen": -4.822530269622803,
"logits/rejected": -4.9560699462890625,
"logps/chosen": -474.0023498535156,
"logps/rejected": -579.5379638671875,
"loss": 0.3354,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.1355338096618652,
"rewards/margins": 1.5200008153915405,
"rewards/rejected": -3.655534267425537,
"step": 235
},
{
"epoch": 0.63,
"grad_norm": 43.0245343621866,
"learning_rate": 1.7846265006183976e-07,
"logits/chosen": -4.6783013343811035,
"logits/rejected": -4.8442277908325195,
"logps/chosen": -524.712158203125,
"logps/rejected": -632.6328125,
"loss": 0.3455,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.490675687789917,
"rewards/margins": 1.6402429342269897,
"rewards/rejected": -4.130918979644775,
"step": 240
},
{
"epoch": 0.65,
"grad_norm": 50.41005236551354,
"learning_rate": 1.6750788081369948e-07,
"logits/chosen": -4.760382175445557,
"logits/rejected": -4.9731292724609375,
"logps/chosen": -475.59765625,
"logps/rejected": -638.8780517578125,
"loss": 0.3533,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.0123469829559326,
"rewards/margins": 2.291203737258911,
"rewards/rejected": -4.303550720214844,
"step": 245
},
{
"epoch": 0.66,
"grad_norm": 41.2044329360216,
"learning_rate": 1.5672812309497722e-07,
"logits/chosen": -4.802388668060303,
"logits/rejected": -4.928206443786621,
"logps/chosen": -410.943115234375,
"logps/rejected": -523.5797119140625,
"loss": 0.3523,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.6920316219329834,
"rewards/margins": 1.4624649286270142,
"rewards/rejected": -3.154496669769287,
"step": 250
},
{
"epoch": 0.67,
"grad_norm": 44.69671736661672,
"learning_rate": 1.461462467495284e-07,
"logits/chosen": -4.758819580078125,
"logits/rejected": -4.947306156158447,
"logps/chosen": -495.4164123535156,
"logps/rejected": -593.2689208984375,
"loss": 0.3703,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.218562602996826,
"rewards/margins": 1.439822793006897,
"rewards/rejected": -3.6583850383758545,
"step": 255
},
{
"epoch": 0.69,
"grad_norm": 47.815462029236265,
"learning_rate": 1.357847018050843e-07,
"logits/chosen": -4.747325420379639,
"logits/rejected": -4.958649158477783,
"logps/chosen": -592.2781372070312,
"logps/rejected": -758.1558837890625,
"loss": 0.3513,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.605052947998047,
"rewards/margins": 2.2088265419006348,
"rewards/rejected": -4.81387996673584,
"step": 260
},
{
"epoch": 0.7,
"grad_norm": 49.07485622430381,
"learning_rate": 1.2566547084429324e-07,
"logits/chosen": -4.789057731628418,
"logits/rejected": -5.023972988128662,
"logps/chosen": -472.7372131347656,
"logps/rejected": -615.0753784179688,
"loss": 0.3591,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.2092373371124268,
"rewards/margins": 1.8947007656097412,
"rewards/rejected": -4.103938579559326,
"step": 265
},
{
"epoch": 0.71,
"grad_norm": 45.44279309151153,
"learning_rate": 1.1581002236747328e-07,
"logits/chosen": -4.7014241218566895,
"logits/rejected": -4.898508548736572,
"logps/chosen": -470.0,
"logps/rejected": -624.503662109375,
"loss": 0.368,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.254678249359131,
"rewards/margins": 2.110576868057251,
"rewards/rejected": -4.365254878997803,
"step": 270
},
{
"epoch": 0.72,
"grad_norm": 53.087469451573526,
"learning_rate": 1.062392652460177e-07,
"logits/chosen": -4.696314811706543,
"logits/rejected": -4.83748197555542,
"logps/chosen": -505.9961853027344,
"logps/rejected": -626.0050048828125,
"loss": 0.3245,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.9974615573883057,
"rewards/margins": 1.828150987625122,
"rewards/rejected": -3.8256123065948486,
"step": 275
},
{
"epoch": 0.74,
"grad_norm": 41.5564187525106,
"learning_rate": 9.697350436308427e-08,
"logits/chosen": -4.737555027008057,
"logits/rejected": -4.948160648345947,
"logps/chosen": -507.66607666015625,
"logps/rejected": -594.7119750976562,
"loss": 0.3584,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.3149771690368652,
"rewards/margins": 1.4441829919815063,
"rewards/rejected": -3.759159803390503,
"step": 280
},
{
"epoch": 0.75,
"grad_norm": 44.45010163032659,
"learning_rate": 8.803239753567829e-08,
"logits/chosen": -4.868664741516113,
"logits/rejected": -4.975742340087891,
"logps/chosen": -469.70263671875,
"logps/rejected": -589.5235595703125,
"loss": 0.3498,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.1795949935913086,
"rewards/margins": 1.7358201742172241,
"rewards/rejected": -3.9154155254364014,
"step": 285
},
{
"epoch": 0.76,
"grad_norm": 38.027371906598745,
"learning_rate": 7.943491380952188e-08,
"logits/chosen": -4.954745292663574,
"logits/rejected": -5.065755844116211,
"logps/chosen": -429.4737243652344,
"logps/rejected": -571.6141967773438,
"loss": 0.3285,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.8640539646148682,
"rewards/margins": 1.8129631280899048,
"rewards/rejected": -3.6770172119140625,
"step": 290
},
{
"epoch": 0.78,
"grad_norm": 47.705363932547336,
"learning_rate": 7.119929321518875e-08,
"logits/chosen": -4.735751152038574,
"logits/rejected": -5.044283866882324,
"logps/chosen": -445.70367431640625,
"logps/rejected": -583.5177612304688,
"loss": 0.3436,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.9234256744384766,
"rewards/margins": 1.9092767238616943,
"rewards/rejected": -3.832702159881592,
"step": 295
},
{
"epoch": 0.79,
"grad_norm": 52.02157401864257,
"learning_rate": 6.334300807088508e-08,
"logits/chosen": -4.835855007171631,
"logits/rejected": -5.021653652191162,
"logps/chosen": -518.7296142578125,
"logps/rejected": -639.1578369140625,
"loss": 0.3689,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.530418634414673,
"rewards/margins": 1.8332151174545288,
"rewards/rejected": -4.363633155822754,
"step": 300
},
{
"epoch": 0.79,
"eval_logits/chosen": -4.618500709533691,
"eval_logits/rejected": -4.810915946960449,
"eval_logps/chosen": -895.0606079101562,
"eval_logps/rejected": -917.572265625,
"eval_loss": 1.0593960285186768,
"eval_rewards/accuracies": 0.5625,
"eval_rewards/chosen": -5.156832218170166,
"eval_rewards/margins": 0.2927757203578949,
"eval_rewards/rejected": -5.449607849121094,
"eval_runtime": 97.5523,
"eval_samples_per_second": 20.502,
"eval_steps_per_second": 0.328,
"step": 300
},
{
"epoch": 0.8,
"grad_norm": 50.197907161154795,
"learning_rate": 5.588272591397336e-08,
"logits/chosen": -4.786068916320801,
"logits/rejected": -4.9945068359375,
"logps/chosen": -485.31280517578125,
"logps/rejected": -626.9613647460938,
"loss": 0.3457,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.227161169052124,
"rewards/margins": 2.0316004753112793,
"rewards/rejected": -4.258761405944824,
"step": 305
},
{
"epoch": 0.82,
"grad_norm": 48.34581690217935,
"learning_rate": 4.8834274139883084e-08,
"logits/chosen": -4.855401039123535,
"logits/rejected": -5.10861873626709,
"logps/chosen": -469.53704833984375,
"logps/rejected": -616.6036376953125,
"loss": 0.3427,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.20617413520813,
"rewards/margins": 1.9504497051239014,
"rewards/rejected": -4.156623840332031,
"step": 310
},
{
"epoch": 0.83,
"grad_norm": 43.92355028448363,
"learning_rate": 4.221260642342786e-08,
"logits/chosen": -4.848982810974121,
"logits/rejected": -4.972110271453857,
"logps/chosen": -502.3720703125,
"logps/rejected": -609.923828125,
"loss": 0.3357,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.262599229812622,
"rewards/margins": 1.6350996494293213,
"rewards/rejected": -3.8976986408233643,
"step": 315
},
{
"epoch": 0.84,
"grad_norm": 48.906535443168416,
"learning_rate": 3.60317709937693e-08,
"logits/chosen": -4.723662853240967,
"logits/rejected": -4.999013900756836,
"logps/chosen": -511.27691650390625,
"logps/rejected": -656.3660888671875,
"loss": 0.361,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.2201476097106934,
"rewards/margins": 2.0478897094726562,
"rewards/rejected": -4.26803731918335,
"step": 320
},
{
"epoch": 0.86,
"grad_norm": 38.024316440823085,
"learning_rate": 3.030488083033273e-08,
"logits/chosen": -4.8308892250061035,
"logits/rejected": -5.002086162567139,
"logps/chosen": -500.25579833984375,
"logps/rejected": -703.7189331054688,
"loss": 0.3434,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.2706263065338135,
"rewards/margins": 2.5099949836730957,
"rewards/rejected": -4.780620574951172,
"step": 325
},
{
"epoch": 0.87,
"grad_norm": 46.39394071999461,
"learning_rate": 2.5044085842905683e-08,
"logits/chosen": -4.880900859832764,
"logits/rejected": -5.102107048034668,
"logps/chosen": -522.2579345703125,
"logps/rejected": -679.2675170898438,
"loss": 0.375,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.569988965988159,
"rewards/margins": 2.0331876277923584,
"rewards/rejected": -4.603176593780518,
"step": 330
},
{
"epoch": 0.88,
"grad_norm": 44.94685644229088,
"learning_rate": 2.0260547094942348e-08,
"logits/chosen": -4.779486656188965,
"logits/rejected": -4.989696502685547,
"logps/chosen": -506.99920654296875,
"logps/rejected": -654.0205688476562,
"loss": 0.3649,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.142956495285034,
"rewards/margins": 2.1117589473724365,
"rewards/rejected": -4.254715919494629,
"step": 335
},
{
"epoch": 0.9,
"grad_norm": 44.46821814832556,
"learning_rate": 1.5964413124758493e-08,
"logits/chosen": -4.752711296081543,
"logits/rejected": -4.939455986022949,
"logps/chosen": -438.1668395996094,
"logps/rejected": -598.4129028320312,
"loss": 0.3357,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.8868913650512695,
"rewards/margins": 1.9845082759857178,
"rewards/rejected": -3.8714001178741455,
"step": 340
},
{
"epoch": 0.91,
"grad_norm": 49.902887693125585,
"learning_rate": 1.2164798414854071e-08,
"logits/chosen": -4.861344814300537,
"logits/rejected": -4.92690896987915,
"logps/chosen": -514.7852783203125,
"logps/rejected": -692.4171142578125,
"loss": 0.3343,
"rewards/accuracies": 0.8125,
"rewards/chosen": -2.289367437362671,
"rewards/margins": 2.1290056705474854,
"rewards/rejected": -4.418373107910156,
"step": 345
},
{
"epoch": 0.92,
"grad_norm": 46.82416143070829,
"learning_rate": 8.869764055041501e-09,
"logits/chosen": -4.845822811126709,
"logits/rejected": -5.040514945983887,
"logps/chosen": -476.73358154296875,
"logps/rejected": -638.4616088867188,
"loss": 0.3795,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.128450870513916,
"rewards/margins": 1.8921457529067993,
"rewards/rejected": -4.020596504211426,
"step": 350
},
{
"epoch": 0.94,
"grad_norm": 45.59470033908034,
"learning_rate": 6.086300640404079e-09,
"logits/chosen": -4.7604827880859375,
"logits/rejected": -4.900928020477295,
"logps/chosen": -530.9966430664062,
"logps/rejected": -647.1216430664062,
"loss": 0.3474,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.3973419666290283,
"rewards/margins": 1.63766610622406,
"rewards/rejected": -4.035007953643799,
"step": 355
},
{
"epoch": 0.95,
"grad_norm": 38.33546651783018,
"learning_rate": 3.82031344036729e-09,
"logits/chosen": -4.72461462020874,
"logits/rejected": -4.975947380065918,
"logps/chosen": -490.10650634765625,
"logps/rejected": -617.1689453125,
"loss": 0.3241,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.2612645626068115,
"rewards/margins": 1.7519699335098267,
"rewards/rejected": -4.013234615325928,
"step": 360
},
{
"epoch": 0.96,
"grad_norm": 48.183739221855284,
"learning_rate": 2.0766098703477173e-09,
"logits/chosen": -4.829585552215576,
"logits/rejected": -5.007233619689941,
"logps/chosen": -458.9842834472656,
"logps/rejected": -589.1207275390625,
"loss": 0.3698,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.9293123483657837,
"rewards/margins": 1.888193130493164,
"rewards/rejected": -3.8175055980682373,
"step": 365
},
{
"epoch": 0.97,
"grad_norm": 47.43892343620843,
"learning_rate": 8.588892925590063e-10,
"logits/chosen": -4.886306285858154,
"logits/rejected": -5.221497535705566,
"logps/chosen": -480.17462158203125,
"logps/rejected": -637.3043823242188,
"loss": 0.3276,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.0167715549468994,
"rewards/margins": 2.319883346557617,
"rewards/rejected": -4.3366546630859375,
"step": 370
},
{
"epoch": 0.99,
"grad_norm": 48.471497735173386,
"learning_rate": 1.6973516761317752e-10,
"logits/chosen": -4.912912845611572,
"logits/rejected": -5.170907020568848,
"logps/chosen": -459.01531982421875,
"logps/rejected": -576.1976318359375,
"loss": 0.3505,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.2589094638824463,
"rewards/margins": 1.6588354110717773,
"rewards/rejected": -3.9177448749542236,
"step": 375
},
{
"epoch": 1.0,
"step": 379,
"total_flos": 0.0,
"train_loss": 0.4399322837512537,
"train_runtime": 5833.2156,
"train_samples_per_second": 8.32,
"train_steps_per_second": 0.065
}
],
"logging_steps": 5,
"max_steps": 379,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}