zephyr-7b-dpo-full / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 49.891043665102934,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -2.7660439014434814,
"logits/rejected": -2.717564582824707,
"logps/chosen": -269.8568420410156,
"logps/rejected": -360.52459716796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 46.946091297352105,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.592543125152588,
"logits/rejected": -2.56319522857666,
"logps/chosen": -264.7040100097656,
"logps/rejected": -251.515625,
"loss": 0.6933,
"rewards/accuracies": 0.4791666567325592,
"rewards/chosen": 0.004693002440035343,
"rewards/margins": 0.0028277651872485876,
"rewards/rejected": 0.0018652371363714337,
"step": 10
},
{
"epoch": 0.04,
"grad_norm": 41.817724108185395,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.65449595451355,
"logits/rejected": -2.6068952083587646,
"logps/chosen": -280.5221252441406,
"logps/rejected": -295.92376708984375,
"loss": 0.689,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.05156273767352104,
"rewards/margins": 0.00740828737616539,
"rewards/rejected": 0.04415445029735565,
"step": 20
},
{
"epoch": 0.06,
"grad_norm": 39.81553425430633,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.6671488285064697,
"logits/rejected": -2.5955922603607178,
"logps/chosen": -296.41644287109375,
"logps/rejected": -260.6401672363281,
"loss": 0.6733,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.2127685844898224,
"rewards/margins": 0.04726782441139221,
"rewards/rejected": 0.16550076007843018,
"step": 30
},
{
"epoch": 0.08,
"grad_norm": 38.58454153774096,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.5658886432647705,
"logits/rejected": -2.5324325561523438,
"logps/chosen": -259.78594970703125,
"logps/rejected": -241.00991821289062,
"loss": 0.6399,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.3669721484184265,
"rewards/margins": 0.19786901772022247,
"rewards/rejected": 0.16910310089588165,
"step": 40
},
{
"epoch": 0.1,
"grad_norm": 37.351752662935816,
"learning_rate": 4.999733114418725e-07,
"logits/chosen": -2.5195257663726807,
"logits/rejected": -2.4827651977539062,
"logps/chosen": -273.65081787109375,
"logps/rejected": -290.78680419921875,
"loss": 0.6094,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.304054319858551,
"rewards/margins": 0.2041884958744049,
"rewards/rejected": 0.09986577928066254,
"step": 50
},
{
"epoch": 0.13,
"grad_norm": 39.61129660699584,
"learning_rate": 4.990398100856366e-07,
"logits/chosen": -2.567991018295288,
"logits/rejected": -2.5036864280700684,
"logps/chosen": -260.38055419921875,
"logps/rejected": -294.011474609375,
"loss": 0.6013,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.5612996220588684,
"rewards/margins": 0.3578048348426819,
"rewards/rejected": 0.20349478721618652,
"step": 60
},
{
"epoch": 0.15,
"grad_norm": 41.460696281749556,
"learning_rate": 4.967775735898179e-07,
"logits/chosen": -2.460195302963257,
"logits/rejected": -2.46120023727417,
"logps/chosen": -253.1399383544922,
"logps/rejected": -253.4242706298828,
"loss": 0.5693,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.6238263845443726,
"rewards/margins": 0.4591788649559021,
"rewards/rejected": 0.16464750468730927,
"step": 70
},
{
"epoch": 0.17,
"grad_norm": 61.37030849441711,
"learning_rate": 4.931986719649298e-07,
"logits/chosen": -2.615948438644409,
"logits/rejected": -2.5394978523254395,
"logps/chosen": -311.7240295410156,
"logps/rejected": -263.1805725097656,
"loss": 0.5671,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.5546952486038208,
"rewards/margins": 0.5107932686805725,
"rewards/rejected": 0.04390193149447441,
"step": 80
},
{
"epoch": 0.19,
"grad_norm": 39.59727717104598,
"learning_rate": 4.883222001996351e-07,
"logits/chosen": -2.5085294246673584,
"logits/rejected": -2.4543616771698,
"logps/chosen": -251.203369140625,
"logps/rejected": -259.8647766113281,
"loss": 0.5646,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.3953971564769745,
"rewards/margins": 0.7687323689460754,
"rewards/rejected": -0.37333518266677856,
"step": 90
},
{
"epoch": 0.21,
"grad_norm": 36.57841721590594,
"learning_rate": 4.821741763807186e-07,
"logits/chosen": -2.499514102935791,
"logits/rejected": -2.4649369716644287,
"logps/chosen": -248.44363403320312,
"logps/rejected": -257.64776611328125,
"loss": 0.565,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.5957446694374084,
"rewards/margins": 0.6267115473747253,
"rewards/rejected": -0.03096688725054264,
"step": 100
},
{
"epoch": 0.21,
"eval_logits/chosen": -2.550398111343384,
"eval_logits/rejected": -2.5104503631591797,
"eval_logps/chosen": -250.69297790527344,
"eval_logps/rejected": -262.7791748046875,
"eval_loss": 0.5717624425888062,
"eval_rewards/accuracies": 0.73828125,
"eval_rewards/chosen": 0.5950239300727844,
"eval_rewards/margins": 0.6006231904029846,
"eval_rewards/rejected": -0.005599223077297211,
"eval_runtime": 96.9486,
"eval_samples_per_second": 20.629,
"eval_steps_per_second": 0.33,
"step": 100
},
{
"epoch": 0.23,
"grad_norm": 51.91494841998397,
"learning_rate": 4.747874028753375e-07,
"logits/chosen": -2.55851149559021,
"logits/rejected": -2.4656014442443848,
"logps/chosen": -292.62615966796875,
"logps/rejected": -258.59661865234375,
"loss": 0.5713,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.5976964831352234,
"rewards/margins": 0.6357330083847046,
"rewards/rejected": -0.0380365327000618,
"step": 110
},
{
"epoch": 0.25,
"grad_norm": 70.69069363258822,
"learning_rate": 4.662012913161997e-07,
"logits/chosen": -2.4600424766540527,
"logits/rejected": -2.4324684143066406,
"logps/chosen": -270.7308349609375,
"logps/rejected": -260.5433349609375,
"loss": 0.5497,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.5218156576156616,
"rewards/margins": 0.5561539530754089,
"rewards/rejected": -0.03433822840452194,
"step": 120
},
{
"epoch": 0.27,
"grad_norm": 42.312370253489476,
"learning_rate": 4.5646165232345103e-07,
"logits/chosen": -2.464791774749756,
"logits/rejected": -2.439894676208496,
"logps/chosen": -268.9382019042969,
"logps/rejected": -269.9627685546875,
"loss": 0.5423,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.6532469987869263,
"rewards/margins": 0.7295945882797241,
"rewards/rejected": -0.07634757459163666,
"step": 130
},
{
"epoch": 0.29,
"grad_norm": 40.45859260542855,
"learning_rate": 4.456204510851956e-07,
"logits/chosen": -2.5265681743621826,
"logits/rejected": -2.485774517059326,
"logps/chosen": -303.1440124511719,
"logps/rejected": -301.68914794921875,
"loss": 0.5376,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.6410696506500244,
"rewards/margins": 0.6916864514350891,
"rewards/rejected": -0.0506168007850647,
"step": 140
},
{
"epoch": 0.31,
"grad_norm": 41.1747855806655,
"learning_rate": 4.337355301007335e-07,
"logits/chosen": -2.5189616680145264,
"logits/rejected": -2.4531705379486084,
"logps/chosen": -272.0736999511719,
"logps/rejected": -276.2969055175781,
"loss": 0.5442,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.5575242042541504,
"rewards/margins": 0.5619192719459534,
"rewards/rejected": -0.004395070485770702,
"step": 150
},
{
"epoch": 0.33,
"grad_norm": 48.726180323544725,
"learning_rate": 4.2087030056579986e-07,
"logits/chosen": -2.5174994468688965,
"logits/rejected": -2.43558406829834,
"logps/chosen": -260.0892028808594,
"logps/rejected": -260.7149658203125,
"loss": 0.5652,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.6489425897598267,
"rewards/margins": 0.8134964108467102,
"rewards/rejected": -0.16455380618572235,
"step": 160
},
{
"epoch": 0.36,
"grad_norm": 49.53825953706789,
"learning_rate": 4.070934040463998e-07,
"logits/chosen": -2.4509148597717285,
"logits/rejected": -2.3897039890289307,
"logps/chosen": -239.52261352539062,
"logps/rejected": -233.6277618408203,
"loss": 0.5489,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.49063143134117126,
"rewards/margins": 0.6157802939414978,
"rewards/rejected": -0.12514881789684296,
"step": 170
},
{
"epoch": 0.38,
"grad_norm": 51.08561061111303,
"learning_rate": 3.9247834624635404e-07,
"logits/chosen": -2.3483898639678955,
"logits/rejected": -2.306784152984619,
"logps/chosen": -247.6396026611328,
"logps/rejected": -231.8523406982422,
"loss": 0.5181,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.43596941232681274,
"rewards/margins": 0.6500319242477417,
"rewards/rejected": -0.21406252682209015,
"step": 180
},
{
"epoch": 0.4,
"grad_norm": 42.31027201995276,
"learning_rate": 3.7710310482256523e-07,
"logits/chosen": -2.41634464263916,
"logits/rejected": -2.378105878829956,
"logps/chosen": -260.20306396484375,
"logps/rejected": -261.46502685546875,
"loss": 0.5392,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.5519876480102539,
"rewards/margins": 0.6375012993812561,
"rewards/rejected": -0.08551368862390518,
"step": 190
},
{
"epoch": 0.42,
"grad_norm": 102.86207924802177,
"learning_rate": 3.610497133404795e-07,
"logits/chosen": -2.392763614654541,
"logits/rejected": -2.381993532180786,
"logps/chosen": -249.912109375,
"logps/rejected": -256.75439453125,
"loss": 0.5467,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.49922746419906616,
"rewards/margins": 0.7344967126846313,
"rewards/rejected": -0.235269233584404,
"step": 200
},
{
"epoch": 0.42,
"eval_logits/chosen": -2.517864942550659,
"eval_logits/rejected": -2.4783387184143066,
"eval_logps/chosen": -249.6370849609375,
"eval_logps/rejected": -264.89788818359375,
"eval_loss": 0.5432960391044617,
"eval_rewards/accuracies": 0.74609375,
"eval_rewards/chosen": 0.6478186845779419,
"eval_rewards/margins": 0.759353518486023,
"eval_rewards/rejected": -0.11153475195169449,
"eval_runtime": 96.4207,
"eval_samples_per_second": 20.742,
"eval_steps_per_second": 0.332,
"step": 200
},
{
"epoch": 0.44,
"grad_norm": 45.308290366409736,
"learning_rate": 3.4440382358952115e-07,
"logits/chosen": -2.4460113048553467,
"logits/rejected": -2.391810894012451,
"logps/chosen": -278.56781005859375,
"logps/rejected": -257.2254943847656,
"loss": 0.5436,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.5562152862548828,
"rewards/margins": 0.8551079034805298,
"rewards/rejected": -0.29889267683029175,
"step": 210
},
{
"epoch": 0.46,
"grad_norm": 50.1182470431882,
"learning_rate": 3.272542485937368e-07,
"logits/chosen": -2.4605488777160645,
"logits/rejected": -2.42708683013916,
"logps/chosen": -257.90826416015625,
"logps/rejected": -253.3182830810547,
"loss": 0.54,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.3734419643878937,
"rewards/margins": 0.7561658024787903,
"rewards/rejected": -0.382723867893219,
"step": 220
},
{
"epoch": 0.48,
"grad_norm": 43.71024962971359,
"learning_rate": 3.096924887558854e-07,
"logits/chosen": -2.490509510040283,
"logits/rejected": -2.4491913318634033,
"logps/chosen": -237.17898559570312,
"logps/rejected": -251.81686401367188,
"loss": 0.5441,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.5843235850334167,
"rewards/margins": 0.7882751226425171,
"rewards/rejected": -0.20395155251026154,
"step": 230
},
{
"epoch": 0.5,
"grad_norm": 44.93616969967234,
"learning_rate": 2.9181224366319943e-07,
"logits/chosen": -2.533695697784424,
"logits/rejected": -2.500807285308838,
"logps/chosen": -253.635498046875,
"logps/rejected": -253.0944061279297,
"loss": 0.5142,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.5032340884208679,
"rewards/margins": 0.7045356035232544,
"rewards/rejected": -0.2013014256954193,
"step": 240
},
{
"epoch": 0.52,
"grad_norm": 42.68904256130122,
"learning_rate": 2.7370891215954565e-07,
"logits/chosen": -2.483025074005127,
"logits/rejected": -2.4015185832977295,
"logps/chosen": -285.0963439941406,
"logps/rejected": -263.43560791015625,
"loss": 0.5198,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.6547069549560547,
"rewards/margins": 0.850358784198761,
"rewards/rejected": -0.19565197825431824,
"step": 250
},
{
"epoch": 0.54,
"grad_norm": 45.43502171857602,
"learning_rate": 2.55479083351317e-07,
"logits/chosen": -2.516913890838623,
"logits/rejected": -2.478473424911499,
"logps/chosen": -282.80230712890625,
"logps/rejected": -258.77288818359375,
"loss": 0.5235,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.5754625797271729,
"rewards/margins": 0.8150871396064758,
"rewards/rejected": -0.23962458968162537,
"step": 260
},
{
"epoch": 0.56,
"grad_norm": 41.73526734917468,
"learning_rate": 2.3722002126275822e-07,
"logits/chosen": -2.5381340980529785,
"logits/rejected": -2.4941086769104004,
"logps/chosen": -267.4333190917969,
"logps/rejected": -260.50677490234375,
"loss": 0.5406,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.4781308174133301,
"rewards/margins": 0.6212563514709473,
"rewards/rejected": -0.14312560856342316,
"step": 270
},
{
"epoch": 0.59,
"grad_norm": 48.561323508433155,
"learning_rate": 2.19029145890313e-07,
"logits/chosen": -2.510133743286133,
"logits/rejected": -2.4422435760498047,
"logps/chosen": -250.73855590820312,
"logps/rejected": -247.487060546875,
"loss": 0.5599,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.5469261407852173,
"rewards/margins": 0.8119627833366394,
"rewards/rejected": -0.2650366425514221,
"step": 280
},
{
"epoch": 0.61,
"grad_norm": 44.504093632124075,
"learning_rate": 2.0100351342479216e-07,
"logits/chosen": -2.5589568614959717,
"logits/rejected": -2.5217483043670654,
"logps/chosen": -240.7520751953125,
"logps/rejected": -244.8422088623047,
"loss": 0.5354,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.5167636871337891,
"rewards/margins": 0.661081075668335,
"rewards/rejected": -0.14431743323802948,
"step": 290
},
{
"epoch": 0.63,
"grad_norm": 52.52022669231452,
"learning_rate": 1.8323929841460178e-07,
"logits/chosen": -2.5429511070251465,
"logits/rejected": -2.472679376602173,
"logps/chosen": -292.2240905761719,
"logps/rejected": -266.68658447265625,
"loss": 0.517,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.4385985732078552,
"rewards/margins": 0.7676541209220886,
"rewards/rejected": -0.329055517911911,
"step": 300
},
{
"epoch": 0.63,
"eval_logits/chosen": -2.5622596740722656,
"eval_logits/rejected": -2.520256280899048,
"eval_logps/chosen": -251.2219696044922,
"eval_logps/rejected": -268.04449462890625,
"eval_loss": 0.53697669506073,
"eval_rewards/accuracies": 0.76953125,
"eval_rewards/chosen": 0.5685745477676392,
"eval_rewards/margins": 0.8374388217926025,
"eval_rewards/rejected": -0.2688642740249634,
"eval_runtime": 96.3678,
"eval_samples_per_second": 20.754,
"eval_steps_per_second": 0.332,
"step": 300
},
{
"epoch": 0.65,
"grad_norm": 43.866661437938184,
"learning_rate": 1.6583128063291573e-07,
"logits/chosen": -2.4593474864959717,
"logits/rejected": -2.443233013153076,
"logps/chosen": -285.5498046875,
"logps/rejected": -263.8379821777344,
"loss": 0.5077,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.4893050193786621,
"rewards/margins": 0.7553777098655701,
"rewards/rejected": -0.26607269048690796,
"step": 310
},
{
"epoch": 0.67,
"grad_norm": 43.407860217947494,
"learning_rate": 1.488723393865766e-07,
"logits/chosen": -2.4746253490448,
"logits/rejected": -2.4388270378112793,
"logps/chosen": -283.4583740234375,
"logps/rejected": -250.38204956054688,
"loss": 0.504,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.5105848908424377,
"rewards/margins": 0.788524329662323,
"rewards/rejected": -0.2779393792152405,
"step": 320
},
{
"epoch": 0.69,
"grad_norm": 40.302692173545196,
"learning_rate": 1.3245295796480788e-07,
"logits/chosen": -2.4712371826171875,
"logits/rejected": -2.4099698066711426,
"logps/chosen": -252.349853515625,
"logps/rejected": -264.03912353515625,
"loss": 0.5242,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.4930170178413391,
"rewards/margins": 0.7200408577919006,
"rewards/rejected": -0.2270239144563675,
"step": 330
},
{
"epoch": 0.71,
"grad_norm": 50.168955016672676,
"learning_rate": 1.1666074087171627e-07,
"logits/chosen": -2.467729091644287,
"logits/rejected": -2.4046943187713623,
"logps/chosen": -278.697509765625,
"logps/rejected": -285.4507141113281,
"loss": 0.524,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.5648467540740967,
"rewards/margins": 0.8352931141853333,
"rewards/rejected": -0.2704463601112366,
"step": 340
},
{
"epoch": 0.73,
"grad_norm": 46.15971070553052,
"learning_rate": 1.0157994641835734e-07,
"logits/chosen": -2.445666790008545,
"logits/rejected": -2.377004384994507,
"logps/chosen": -248.63241577148438,
"logps/rejected": -248.23904418945312,
"loss": 0.4924,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.47894006967544556,
"rewards/margins": 0.8554509878158569,
"rewards/rejected": -0.37651100754737854,
"step": 350
},
{
"epoch": 0.75,
"grad_norm": 54.17198760484943,
"learning_rate": 8.729103716819111e-08,
"logits/chosen": -2.4745380878448486,
"logits/rejected": -2.376185178756714,
"logps/chosen": -292.89483642578125,
"logps/rejected": -269.1952209472656,
"loss": 0.5388,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.4898607134819031,
"rewards/margins": 0.8843740224838257,
"rewards/rejected": -0.3945133090019226,
"step": 360
},
{
"epoch": 0.77,
"grad_norm": 44.15468601338237,
"learning_rate": 7.387025063449081e-08,
"logits/chosen": -2.409170150756836,
"logits/rejected": -2.367518186569214,
"logps/chosen": -266.35430908203125,
"logps/rejected": -242.5480194091797,
"loss": 0.5384,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.44893550872802734,
"rewards/margins": 0.6646324992179871,
"rewards/rejected": -0.21569697558879852,
"step": 370
},
{
"epoch": 0.79,
"grad_norm": 39.47383320898196,
"learning_rate": 6.138919252022435e-08,
"logits/chosen": -2.3523006439208984,
"logits/rejected": -2.3420968055725098,
"logps/chosen": -230.9795379638672,
"logps/rejected": -267.8912658691406,
"loss": 0.5181,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.4252557158470154,
"rewards/margins": 0.8839966058731079,
"rewards/rejected": -0.45874080061912537,
"step": 380
},
{
"epoch": 0.82,
"grad_norm": 48.64961363299689,
"learning_rate": 4.991445467064689e-08,
"logits/chosen": -2.4286305904388428,
"logits/rejected": -2.394604206085205,
"logps/chosen": -293.20440673828125,
"logps/rejected": -287.0997009277344,
"loss": 0.5149,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.6216251850128174,
"rewards/margins": 0.7780872583389282,
"rewards/rejected": -0.15646204352378845,
"step": 390
},
{
"epoch": 0.84,
"grad_norm": 44.38080817079193,
"learning_rate": 3.9507259776993954e-08,
"logits/chosen": -2.4102301597595215,
"logits/rejected": -2.3357295989990234,
"logps/chosen": -259.7147521972656,
"logps/rejected": -273.10699462890625,
"loss": 0.518,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.541024386882782,
"rewards/margins": 0.8457515835762024,
"rewards/rejected": -0.30472710728645325,
"step": 400
},
{
"epoch": 0.84,
"eval_logits/chosen": -2.4731171131134033,
"eval_logits/rejected": -2.4323782920837402,
"eval_logps/chosen": -250.02178955078125,
"eval_logps/rejected": -267.0915222167969,
"eval_loss": 0.5348160862922668,
"eval_rewards/accuracies": 0.75390625,
"eval_rewards/chosen": 0.6285843849182129,
"eval_rewards/margins": 0.8498014211654663,
"eval_rewards/rejected": -0.22121697664260864,
"eval_runtime": 96.4764,
"eval_samples_per_second": 20.73,
"eval_steps_per_second": 0.332,
"step": 400
},
{
"epoch": 0.86,
"grad_norm": 48.15981350653104,
"learning_rate": 3.022313472693447e-08,
"logits/chosen": -2.444577932357788,
"logits/rejected": -2.3699073791503906,
"logps/chosen": -286.5138854980469,
"logps/rejected": -274.3666687011719,
"loss": 0.5226,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.6201778650283813,
"rewards/margins": 0.8976529240608215,
"rewards/rejected": -0.2774750590324402,
"step": 410
},
{
"epoch": 0.88,
"grad_norm": 48.573506313099124,
"learning_rate": 2.2111614344599684e-08,
"logits/chosen": -2.429912805557251,
"logits/rejected": -2.3931796550750732,
"logps/chosen": -287.13067626953125,
"logps/rejected": -279.46844482421875,
"loss": 0.5212,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.4971562325954437,
"rewards/margins": 0.7474662065505981,
"rewards/rejected": -0.25030994415283203,
"step": 420
},
{
"epoch": 0.9,
"grad_norm": 41.98038749926915,
"learning_rate": 1.521597710086439e-08,
"logits/chosen": -2.3573684692382812,
"logits/rejected": -2.3092567920684814,
"logps/chosen": -269.9436950683594,
"logps/rejected": -265.4564514160156,
"loss": 0.501,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.45472264289855957,
"rewards/margins": 0.838543713092804,
"rewards/rejected": -0.38382115960121155,
"step": 430
},
{
"epoch": 0.92,
"grad_norm": 44.22650678163462,
"learning_rate": 9.57301420397924e-09,
"logits/chosen": -2.4332785606384277,
"logits/rejected": -2.3776473999023438,
"logps/chosen": -272.65960693359375,
"logps/rejected": -271.44329833984375,
"loss": 0.5213,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.5838757157325745,
"rewards/margins": 0.700454831123352,
"rewards/rejected": -0.1165790781378746,
"step": 440
},
{
"epoch": 0.94,
"grad_norm": 43.00589727019739,
"learning_rate": 5.212833302556258e-09,
"logits/chosen": -2.3836779594421387,
"logits/rejected": -2.360665798187256,
"logps/chosen": -284.2134704589844,
"logps/rejected": -312.9830627441406,
"loss": 0.5099,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.4735330641269684,
"rewards/margins": 0.7689631581306458,
"rewards/rejected": -0.29543009400367737,
"step": 450
},
{
"epoch": 0.96,
"grad_norm": 46.86754726240038,
"learning_rate": 2.158697848236607e-09,
"logits/chosen": -2.417273998260498,
"logits/rejected": -2.377349376678467,
"logps/chosen": -262.1804504394531,
"logps/rejected": -247.7431182861328,
"loss": 0.5264,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.48920711874961853,
"rewards/margins": 0.7419728636741638,
"rewards/rejected": -0.2527657151222229,
"step": 460
},
{
"epoch": 0.98,
"grad_norm": 45.055740606082026,
"learning_rate": 4.269029751107489e-10,
"logits/chosen": -2.4338390827178955,
"logits/rejected": -2.3758208751678467,
"logps/chosen": -268.4836730957031,
"logps/rejected": -289.60205078125,
"loss": 0.4974,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.5107508897781372,
"rewards/margins": 0.8732994794845581,
"rewards/rejected": -0.3625485301017761,
"step": 470
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 0.0,
"train_loss": 0.5478911828795238,
"train_runtime": 7553.9268,
"train_samples_per_second": 8.093,
"train_steps_per_second": 0.063
}
],
"logging_steps": 10,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}