zephyr-7b-dpo-full / trainer_state.json
RikkiXu's picture
Model save
f41965d verified
raw history blame
No virus
27.7 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 43894.48099242753,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -1.689455509185791,
"logits/rejected": -1.4794573783874512,
"logps/chosen": -126.21005249023438,
"logps/rejected": -98.13133239746094,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 32305.118552441847,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -1.7068803310394287,
"logits/rejected": -1.6096948385238647,
"logps/chosen": -139.68423461914062,
"logps/rejected": -91.41385650634766,
"loss": 2.648,
"rewards/accuracies": 0.5069444179534912,
"rewards/chosen": 0.3865443468093872,
"rewards/margins": 1.56412935256958,
"rewards/rejected": -1.1775851249694824,
"step": 10
},
{
"epoch": 0.04,
"grad_norm": 12815.76079839475,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -1.6411230564117432,
"logits/rejected": -1.6499197483062744,
"logps/chosen": -131.1981964111328,
"logps/rejected": -93.75257110595703,
"loss": 0.8229,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 14.764315605163574,
"rewards/margins": 19.453596115112305,
"rewards/rejected": -4.68928337097168,
"step": 20
},
{
"epoch": 0.06,
"grad_norm": 8663.076785137986,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -1.7121353149414062,
"logits/rejected": -1.6375898122787476,
"logps/chosen": -133.71029663085938,
"logps/rejected": -103.07096099853516,
"loss": 0.5133,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 44.873260498046875,
"rewards/margins": 58.086036682128906,
"rewards/rejected": -13.2127685546875,
"step": 30
},
{
"epoch": 0.08,
"grad_norm": 9873.577427815002,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -1.6784517765045166,
"logits/rejected": -1.5826914310455322,
"logps/chosen": -145.05630493164062,
"logps/rejected": -101.44771575927734,
"loss": 0.516,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 66.6468734741211,
"rewards/margins": 95.41236114501953,
"rewards/rejected": -28.765483856201172,
"step": 40
},
{
"epoch": 0.1,
"grad_norm": 15484.454406367853,
"learning_rate": 4.999733114418725e-07,
"logits/chosen": -1.6393781900405884,
"logits/rejected": -1.6535584926605225,
"logps/chosen": -127.37105560302734,
"logps/rejected": -104.55952453613281,
"loss": 0.6497,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 77.03364562988281,
"rewards/margins": 131.46775817871094,
"rewards/rejected": -54.434104919433594,
"step": 50
},
{
"epoch": 0.13,
"grad_norm": 4641.588943610254,
"learning_rate": 4.990398100856366e-07,
"logits/chosen": -1.7217296361923218,
"logits/rejected": -1.651254653930664,
"logps/chosen": -141.35108947753906,
"logps/rejected": -108.5528793334961,
"loss": 0.7574,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 96.49129486083984,
"rewards/margins": 164.4349822998047,
"rewards/rejected": -67.94366455078125,
"step": 60
},
{
"epoch": 0.15,
"grad_norm": 10223.596716214304,
"learning_rate": 4.967775735898179e-07,
"logits/chosen": -1.6282870769500732,
"logits/rejected": -1.6370842456817627,
"logps/chosen": -134.8829803466797,
"logps/rejected": -106.41259765625,
"loss": 1.0394,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 102.15782928466797,
"rewards/margins": 186.63003540039062,
"rewards/rejected": -84.47221374511719,
"step": 70
},
{
"epoch": 0.17,
"grad_norm": 6360.665892121058,
"learning_rate": 4.931986719649298e-07,
"logits/chosen": -1.5932952165603638,
"logits/rejected": -1.5602772235870361,
"logps/chosen": -130.39671325683594,
"logps/rejected": -101.85746002197266,
"loss": 1.1921,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 100.63119506835938,
"rewards/margins": 181.93630981445312,
"rewards/rejected": -81.30511474609375,
"step": 80
},
{
"epoch": 0.19,
"grad_norm": 12014.555301109034,
"learning_rate": 4.883222001996351e-07,
"logits/chosen": -1.6406339406967163,
"logits/rejected": -1.6412605047225952,
"logps/chosen": -138.44619750976562,
"logps/rejected": -112.9968032836914,
"loss": 0.9751,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 107.64051818847656,
"rewards/margins": 218.6805419921875,
"rewards/rejected": -111.0400390625,
"step": 90
},
{
"epoch": 0.21,
"grad_norm": 11340.576903436586,
"learning_rate": 4.821741763807186e-07,
"logits/chosen": -1.6779956817626953,
"logits/rejected": -1.6324456930160522,
"logps/chosen": -118.78487396240234,
"logps/rejected": -101.80384826660156,
"loss": 1.1504,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 91.92900848388672,
"rewards/margins": 203.4224090576172,
"rewards/rejected": -111.493408203125,
"step": 100
},
{
"epoch": 0.21,
"eval_logits/chosen": -1.724159836769104,
"eval_logits/rejected": -1.6941893100738525,
"eval_logps/chosen": -127.37677764892578,
"eval_logps/rejected": -104.87450408935547,
"eval_loss": 0.9783788323402405,
"eval_rewards/accuracies": 0.91015625,
"eval_rewards/chosen": 103.95471954345703,
"eval_rewards/margins": 220.29249572753906,
"eval_rewards/rejected": -116.33775329589844,
"eval_runtime": 97.7821,
"eval_samples_per_second": 20.454,
"eval_steps_per_second": 0.327,
"step": 100
},
{
"epoch": 0.23,
"grad_norm": 9309.434410308813,
"learning_rate": 4.747874028753375e-07,
"logits/chosen": -1.6036758422851562,
"logits/rejected": -1.651767373085022,
"logps/chosen": -123.1724624633789,
"logps/rejected": -111.15580749511719,
"loss": 1.3815,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 88.74246978759766,
"rewards/margins": 178.5576934814453,
"rewards/rejected": -89.81523132324219,
"step": 110
},
{
"epoch": 0.25,
"grad_norm": 7646.143632789072,
"learning_rate": 4.662012913161997e-07,
"logits/chosen": -1.6596767902374268,
"logits/rejected": -1.639947533607483,
"logps/chosen": -122.2258071899414,
"logps/rejected": -112.39066314697266,
"loss": 1.2948,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 106.12471008300781,
"rewards/margins": 216.6038818359375,
"rewards/rejected": -110.47917175292969,
"step": 120
},
{
"epoch": 0.27,
"grad_norm": 5056.012586834783,
"learning_rate": 4.5646165232345103e-07,
"logits/chosen": -1.6654844284057617,
"logits/rejected": -1.6651356220245361,
"logps/chosen": -129.16343688964844,
"logps/rejected": -105.808837890625,
"loss": 1.5047,
"rewards/accuracies": 0.90625,
"rewards/chosen": 115.51092529296875,
"rewards/margins": 240.7315673828125,
"rewards/rejected": -125.22064208984375,
"step": 130
},
{
"epoch": 0.29,
"grad_norm": 13712.265823185711,
"learning_rate": 4.456204510851956e-07,
"logits/chosen": -1.5624234676361084,
"logits/rejected": -1.5188586711883545,
"logps/chosen": -126.886474609375,
"logps/rejected": -106.10212707519531,
"loss": 2.4129,
"rewards/accuracies": 0.875,
"rewards/chosen": 106.82698822021484,
"rewards/margins": 237.18234252929688,
"rewards/rejected": -130.3553924560547,
"step": 140
},
{
"epoch": 0.31,
"grad_norm": 11133.14391408868,
"learning_rate": 4.337355301007335e-07,
"logits/chosen": -1.688401460647583,
"logits/rejected": -1.706859827041626,
"logps/chosen": -124.13720703125,
"logps/rejected": -103.48664855957031,
"loss": 1.5451,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 115.81596374511719,
"rewards/margins": 246.532958984375,
"rewards/rejected": -130.7169647216797,
"step": 150
},
{
"epoch": 0.33,
"grad_norm": 13437.263938948909,
"learning_rate": 4.2087030056579986e-07,
"logits/chosen": -1.5865412950515747,
"logits/rejected": -1.5408028364181519,
"logps/chosen": -131.3970489501953,
"logps/rejected": -111.11344909667969,
"loss": 1.9447,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 105.68785095214844,
"rewards/margins": 227.367919921875,
"rewards/rejected": -121.6800308227539,
"step": 160
},
{
"epoch": 0.36,
"grad_norm": 12185.877768787304,
"learning_rate": 4.070934040463998e-07,
"logits/chosen": -1.8080990314483643,
"logits/rejected": -1.7689082622528076,
"logps/chosen": -127.97332763671875,
"logps/rejected": -110.8963623046875,
"loss": 2.3084,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 111.68205261230469,
"rewards/margins": 263.9295349121094,
"rewards/rejected": -152.2474822998047,
"step": 170
},
{
"epoch": 0.38,
"grad_norm": 12397.612395894674,
"learning_rate": 3.9247834624635404e-07,
"logits/chosen": -1.7653987407684326,
"logits/rejected": -1.7487728595733643,
"logps/chosen": -130.79266357421875,
"logps/rejected": -106.92414855957031,
"loss": 2.1426,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 123.60539245605469,
"rewards/margins": 284.14984130859375,
"rewards/rejected": -160.54443359375,
"step": 180
},
{
"epoch": 0.4,
"grad_norm": 8233.63739133568,
"learning_rate": 3.7710310482256523e-07,
"logits/chosen": -1.803746223449707,
"logits/rejected": -1.7734184265136719,
"logps/chosen": -120.31190490722656,
"logps/rejected": -120.1562271118164,
"loss": 1.8983,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 114.87654876708984,
"rewards/margins": 295.48638916015625,
"rewards/rejected": -180.6098175048828,
"step": 190
},
{
"epoch": 0.42,
"grad_norm": 8854.142242075086,
"learning_rate": 3.610497133404795e-07,
"logits/chosen": -1.748956322669983,
"logits/rejected": -1.7438066005706787,
"logps/chosen": -126.25050354003906,
"logps/rejected": -105.51225280761719,
"loss": 2.8553,
"rewards/accuracies": 0.9375,
"rewards/chosen": 128.85885620117188,
"rewards/margins": 303.90032958984375,
"rewards/rejected": -175.04144287109375,
"step": 200
},
{
"epoch": 0.42,
"eval_logits/chosen": -1.8518784046173096,
"eval_logits/rejected": -1.8462214469909668,
"eval_logps/chosen": -125.94612121582031,
"eval_logps/rejected": -111.28173828125,
"eval_loss": 1.8849064111709595,
"eval_rewards/accuracies": 0.91015625,
"eval_rewards/chosen": 118.2613525390625,
"eval_rewards/margins": 298.67144775390625,
"eval_rewards/rejected": -180.41009521484375,
"eval_runtime": 97.5342,
"eval_samples_per_second": 20.506,
"eval_steps_per_second": 0.328,
"step": 200
},
{
"epoch": 0.44,
"grad_norm": 11055.40272221904,
"learning_rate": 3.4440382358952115e-07,
"logits/chosen": -1.6907581090927124,
"logits/rejected": -1.6238548755645752,
"logps/chosen": -122.90483093261719,
"logps/rejected": -103.57493591308594,
"loss": 2.2063,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 96.85926818847656,
"rewards/margins": 236.5902862548828,
"rewards/rejected": -139.7310028076172,
"step": 210
},
{
"epoch": 0.46,
"grad_norm": 8772.23523058132,
"learning_rate": 3.272542485937368e-07,
"logits/chosen": -1.832867980003357,
"logits/rejected": -1.8748031854629517,
"logps/chosen": -124.8982162475586,
"logps/rejected": -103.1186752319336,
"loss": 2.1666,
"rewards/accuracies": 0.90625,
"rewards/chosen": 115.77070617675781,
"rewards/margins": 271.67108154296875,
"rewards/rejected": -155.9003448486328,
"step": 220
},
{
"epoch": 0.48,
"grad_norm": 7569.137122040314,
"learning_rate": 3.096924887558854e-07,
"logits/chosen": -1.7989298105239868,
"logits/rejected": -1.737357497215271,
"logps/chosen": -135.94210815429688,
"logps/rejected": -119.22425842285156,
"loss": 2.7122,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 117.66719818115234,
"rewards/margins": 311.7781677246094,
"rewards/rejected": -194.11097717285156,
"step": 230
},
{
"epoch": 0.5,
"grad_norm": 9108.756414029493,
"learning_rate": 2.9181224366319943e-07,
"logits/chosen": -1.831538438796997,
"logits/rejected": -1.8362337350845337,
"logps/chosen": -121.7720947265625,
"logps/rejected": -112.72883605957031,
"loss": 2.3272,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 101.53041076660156,
"rewards/margins": 252.52392578125,
"rewards/rejected": -150.99354553222656,
"step": 240
},
{
"epoch": 0.52,
"grad_norm": 11190.763278546787,
"learning_rate": 2.7370891215954565e-07,
"logits/chosen": -1.83551824092865,
"logits/rejected": -1.8649381399154663,
"logps/chosen": -125.136474609375,
"logps/rejected": -114.23868560791016,
"loss": 2.3585,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 122.786865234375,
"rewards/margins": 295.3399658203125,
"rewards/rejected": -172.55311584472656,
"step": 250
},
{
"epoch": 0.54,
"grad_norm": 6158.166356558157,
"learning_rate": 2.55479083351317e-07,
"logits/chosen": -1.8736432790756226,
"logits/rejected": -1.887500524520874,
"logps/chosen": -129.7669677734375,
"logps/rejected": -101.11165618896484,
"loss": 2.615,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 140.60031127929688,
"rewards/margins": 302.95391845703125,
"rewards/rejected": -162.3535919189453,
"step": 260
},
{
"epoch": 0.56,
"grad_norm": 5058.294410517059,
"learning_rate": 2.3722002126275822e-07,
"logits/chosen": -1.8732073307037354,
"logits/rejected": -1.817939043045044,
"logps/chosen": -120.4271011352539,
"logps/rejected": -108.13395690917969,
"loss": 2.1432,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 116.31379699707031,
"rewards/margins": 285.315185546875,
"rewards/rejected": -169.00140380859375,
"step": 270
},
{
"epoch": 0.59,
"grad_norm": 8591.417699065232,
"learning_rate": 2.19029145890313e-07,
"logits/chosen": -1.7311344146728516,
"logits/rejected": -1.812242865562439,
"logps/chosen": -126.07157897949219,
"logps/rejected": -120.28532409667969,
"loss": 3.1754,
"rewards/accuracies": 0.90625,
"rewards/chosen": 109.85076904296875,
"rewards/margins": 299.208740234375,
"rewards/rejected": -189.35797119140625,
"step": 280
},
{
"epoch": 0.61,
"grad_norm": 10809.88836686454,
"learning_rate": 2.0100351342479216e-07,
"logits/chosen": -1.8156871795654297,
"logits/rejected": -1.7812505960464478,
"logps/chosen": -116.09767150878906,
"logps/rejected": -107.4920883178711,
"loss": 2.8029,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 120.6517333984375,
"rewards/margins": 304.40826416015625,
"rewards/rejected": -183.7565460205078,
"step": 290
},
{
"epoch": 0.63,
"grad_norm": 7050.7886997426285,
"learning_rate": 1.8323929841460178e-07,
"logits/chosen": -1.7682945728302002,
"logits/rejected": -1.7057702541351318,
"logps/chosen": -134.6074676513672,
"logps/rejected": -124.12396240234375,
"loss": 2.2897,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 118.32698059082031,
"rewards/margins": 297.16571044921875,
"rewards/rejected": -178.83876037597656,
"step": 300
},
{
"epoch": 0.63,
"eval_logits/chosen": -1.868285059928894,
"eval_logits/rejected": -1.8641510009765625,
"eval_logps/chosen": -124.9817886352539,
"eval_logps/rejected": -112.93190002441406,
"eval_loss": 2.1029016971588135,
"eval_rewards/accuracies": 0.9140625,
"eval_rewards/chosen": 127.90460968017578,
"eval_rewards/margins": 324.8162536621094,
"eval_rewards/rejected": -196.91163635253906,
"eval_runtime": 97.6004,
"eval_samples_per_second": 20.492,
"eval_steps_per_second": 0.328,
"step": 300
},
{
"epoch": 0.65,
"grad_norm": 11804.0120903992,
"learning_rate": 1.6583128063291573e-07,
"logits/chosen": -1.7608633041381836,
"logits/rejected": -1.7920173406600952,
"logps/chosen": -126.25215911865234,
"logps/rejected": -115.8753433227539,
"loss": 2.7724,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 130.85523986816406,
"rewards/margins": 304.2178039550781,
"rewards/rejected": -173.3625946044922,
"step": 310
},
{
"epoch": 0.67,
"grad_norm": 10459.35107314203,
"learning_rate": 1.488723393865766e-07,
"logits/chosen": -1.7917076349258423,
"logits/rejected": -1.7515103816986084,
"logps/chosen": -116.50152587890625,
"logps/rejected": -114.2782211303711,
"loss": 2.4757,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 142.95352172851562,
"rewards/margins": 323.181884765625,
"rewards/rejected": -180.22837829589844,
"step": 320
},
{
"epoch": 0.69,
"grad_norm": 8336.1056366251,
"learning_rate": 1.3245295796480788e-07,
"logits/chosen": -1.7024962902069092,
"logits/rejected": -1.7793302536010742,
"logps/chosen": -126.26700592041016,
"logps/rejected": -110.26517486572266,
"loss": 2.626,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 119.15933990478516,
"rewards/margins": 284.33038330078125,
"rewards/rejected": -165.17105102539062,
"step": 330
},
{
"epoch": 0.71,
"grad_norm": 6521.040226026619,
"learning_rate": 1.1666074087171627e-07,
"logits/chosen": -1.6726289987564087,
"logits/rejected": -1.7343635559082031,
"logps/chosen": -120.09101867675781,
"logps/rejected": -102.69850158691406,
"loss": 3.9728,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 103.992919921875,
"rewards/margins": 254.6125030517578,
"rewards/rejected": -150.61959838867188,
"step": 340
},
{
"epoch": 0.73,
"grad_norm": 9653.282761957253,
"learning_rate": 1.0157994641835734e-07,
"logits/chosen": -1.7162805795669556,
"logits/rejected": -1.7199398279190063,
"logps/chosen": -120.1608657836914,
"logps/rejected": -121.0823974609375,
"loss": 2.2998,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 99.24105072021484,
"rewards/margins": 294.47723388671875,
"rewards/rejected": -195.23617553710938,
"step": 350
},
{
"epoch": 0.75,
"grad_norm": 8483.170816558306,
"learning_rate": 8.729103716819111e-08,
"logits/chosen": -1.7057559490203857,
"logits/rejected": -1.7162882089614868,
"logps/chosen": -126.68985748291016,
"logps/rejected": -117.80401611328125,
"loss": 1.8659,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 114.60487365722656,
"rewards/margins": 299.46978759765625,
"rewards/rejected": -184.86489868164062,
"step": 360
},
{
"epoch": 0.77,
"grad_norm": 16292.765448877528,
"learning_rate": 7.387025063449081e-08,
"logits/chosen": -1.8123562335968018,
"logits/rejected": -1.7508220672607422,
"logps/chosen": -116.35862731933594,
"logps/rejected": -107.92704010009766,
"loss": 2.4835,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 100.31108093261719,
"rewards/margins": 254.4406280517578,
"rewards/rejected": -154.12954711914062,
"step": 370
},
{
"epoch": 0.79,
"grad_norm": 12994.314264805473,
"learning_rate": 6.138919252022435e-08,
"logits/chosen": -1.84799063205719,
"logits/rejected": -1.857129693031311,
"logps/chosen": -124.62117004394531,
"logps/rejected": -118.90538024902344,
"loss": 2.1362,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 120.3254623413086,
"rewards/margins": 317.2894592285156,
"rewards/rejected": -196.96397399902344,
"step": 380
},
{
"epoch": 0.82,
"grad_norm": 11506.166124614792,
"learning_rate": 4.991445467064689e-08,
"logits/chosen": -1.7219364643096924,
"logits/rejected": -1.7155206203460693,
"logps/chosen": -119.84449768066406,
"logps/rejected": -109.42668151855469,
"loss": 2.056,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 126.5132827758789,
"rewards/margins": 317.73858642578125,
"rewards/rejected": -191.22531127929688,
"step": 390
},
{
"epoch": 0.84,
"grad_norm": 14299.186863928613,
"learning_rate": 3.9507259776993954e-08,
"logits/chosen": -1.7028141021728516,
"logits/rejected": -1.7706788778305054,
"logps/chosen": -122.94708251953125,
"logps/rejected": -110.153564453125,
"loss": 2.2714,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 98.14391326904297,
"rewards/margins": 283.8785400390625,
"rewards/rejected": -185.73460388183594,
"step": 400
},
{
"epoch": 0.84,
"eval_logits/chosen": -1.8724907636642456,
"eval_logits/rejected": -1.8708370923995972,
"eval_logps/chosen": -125.25865936279297,
"eval_logps/rejected": -113.72941589355469,
"eval_loss": 1.8651787042617798,
"eval_rewards/accuracies": 0.9140625,
"eval_rewards/chosen": 125.13589477539062,
"eval_rewards/margins": 330.022705078125,
"eval_rewards/rejected": -204.8868408203125,
"eval_runtime": 97.6945,
"eval_samples_per_second": 20.472,
"eval_steps_per_second": 0.328,
"step": 400
},
{
"epoch": 0.86,
"grad_norm": 9838.615615925528,
"learning_rate": 3.022313472693447e-08,
"logits/chosen": -1.7946465015411377,
"logits/rejected": -1.8112504482269287,
"logps/chosen": -134.29222106933594,
"logps/rejected": -112.4157943725586,
"loss": 2.3885,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 130.2279815673828,
"rewards/margins": 333.3272705078125,
"rewards/rejected": -203.0992889404297,
"step": 410
},
{
"epoch": 0.88,
"grad_norm": 9593.489304491115,
"learning_rate": 2.2111614344599684e-08,
"logits/chosen": -1.7704830169677734,
"logits/rejected": -1.758772611618042,
"logps/chosen": -118.04278564453125,
"logps/rejected": -109.47645568847656,
"loss": 1.872,
"rewards/accuracies": 0.90625,
"rewards/chosen": 102.66886138916016,
"rewards/margins": 290.7878112792969,
"rewards/rejected": -188.11898803710938,
"step": 420
},
{
"epoch": 0.9,
"grad_norm": 6763.365413342396,
"learning_rate": 1.521597710086439e-08,
"logits/chosen": -1.710999846458435,
"logits/rejected": -1.7392107248306274,
"logps/chosen": -134.11866760253906,
"logps/rejected": -109.66754150390625,
"loss": 2.0497,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 119.90589904785156,
"rewards/margins": 281.7261657714844,
"rewards/rejected": -161.82025146484375,
"step": 430
},
{
"epoch": 0.92,
"grad_norm": 8451.951191648946,
"learning_rate": 9.57301420397924e-09,
"logits/chosen": -1.852447748184204,
"logits/rejected": -1.8499510288238525,
"logps/chosen": -125.65571594238281,
"logps/rejected": -113.3338394165039,
"loss": 2.2476,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 133.1741485595703,
"rewards/margins": 332.48126220703125,
"rewards/rejected": -199.3070831298828,
"step": 440
},
{
"epoch": 0.94,
"grad_norm": 9580.867576787692,
"learning_rate": 5.212833302556258e-09,
"logits/chosen": -1.8617397546768188,
"logits/rejected": -1.8191306591033936,
"logps/chosen": -121.5869369506836,
"logps/rejected": -108.55745697021484,
"loss": 2.5789,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 137.1997528076172,
"rewards/margins": 339.0559387207031,
"rewards/rejected": -201.85618591308594,
"step": 450
},
{
"epoch": 0.96,
"grad_norm": 4611.3001759975,
"learning_rate": 2.158697848236607e-09,
"logits/chosen": -1.7538158893585205,
"logits/rejected": -1.7651903629302979,
"logps/chosen": -126.25445556640625,
"logps/rejected": -113.3001937866211,
"loss": 1.6224,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 130.99368286132812,
"rewards/margins": 307.7095642089844,
"rewards/rejected": -176.7158660888672,
"step": 460
},
{
"epoch": 0.98,
"grad_norm": 8182.843310129901,
"learning_rate": 4.269029751107489e-10,
"logits/chosen": -1.728009581565857,
"logits/rejected": -1.7379541397094727,
"logps/chosen": -119.18067932128906,
"logps/rejected": -124.09730529785156,
"loss": 2.184,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 105.3095703125,
"rewards/margins": 308.53289794921875,
"rewards/rejected": -203.22329711914062,
"step": 470
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 0.0,
"train_loss": 2.0128297995323914,
"train_runtime": 7588.5519,
"train_samples_per_second": 8.056,
"train_steps_per_second": 0.063
}
],
"logging_steps": 10,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}