zephyr-7b-dpo-full / trainer_state.json
RikkiXu's picture
Model save
62106cc verified
raw
history blame
No virus
27.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 74.50819179863889,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -2.7660439014434814,
"logits/rejected": -2.717564582824707,
"logps/chosen": -269.8568420410156,
"logps/rejected": -360.52459716796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 71.5827858042053,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.592801809310913,
"logits/rejected": -2.5633366107940674,
"logps/chosen": -264.5331726074219,
"logps/rejected": -251.33367919921875,
"loss": 0.6884,
"rewards/accuracies": 0.4444444477558136,
"rewards/chosen": 0.2647041380405426,
"rewards/margins": 0.0454571396112442,
"rewards/rejected": 0.2192470282316208,
"step": 10
},
{
"epoch": 0.04,
"grad_norm": 33.37630632393394,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.6635663509368896,
"logits/rejected": -2.6177525520324707,
"logps/chosen": -275.1928405761719,
"logps/rejected": -290.4365539550781,
"loss": 0.5763,
"rewards/accuracies": 0.5,
"rewards/chosen": 6.3604888916015625,
"rewards/margins": -0.009852093644440174,
"rewards/rejected": 6.370340824127197,
"step": 20
},
{
"epoch": 0.06,
"grad_norm": 22.1278736890366,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.7272486686706543,
"logits/rejected": -2.667067527770996,
"logps/chosen": -285.1613464355469,
"logps/rejected": -249.3108367919922,
"loss": 0.4416,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 15.510467529296875,
"rewards/margins": 0.8711569905281067,
"rewards/rejected": 14.639310836791992,
"step": 30
},
{
"epoch": 0.08,
"grad_norm": 17.071895487907064,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.6888694763183594,
"logits/rejected": -2.6701016426086426,
"logps/chosen": -247.84716796875,
"logps/rejected": -227.38131713867188,
"loss": 0.3982,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 19.278215408325195,
"rewards/margins": 2.267552137374878,
"rewards/rejected": 17.010662078857422,
"step": 40
},
{
"epoch": 0.1,
"grad_norm": 14.78162706214556,
"learning_rate": 4.999733114418725e-07,
"logits/chosen": -2.659508466720581,
"logits/rejected": -2.6249804496765137,
"logps/chosen": -259.9454650878906,
"logps/rejected": -272.14227294921875,
"loss": 0.3676,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 19.786420822143555,
"rewards/margins": -0.8553922772407532,
"rewards/rejected": 20.64181137084961,
"step": 50
},
{
"epoch": 0.13,
"grad_norm": 14.285832773490087,
"learning_rate": 4.990398100856366e-07,
"logits/chosen": -2.6977388858795166,
"logits/rejected": -2.654181957244873,
"logps/chosen": -247.1780242919922,
"logps/rejected": -275.7373962402344,
"loss": 0.3521,
"rewards/accuracies": 0.5625,
"rewards/chosen": 24.428516387939453,
"rewards/margins": 2.0845706462860107,
"rewards/rejected": 22.343944549560547,
"step": 60
},
{
"epoch": 0.15,
"grad_norm": 14.416469937136577,
"learning_rate": 4.967775735898179e-07,
"logits/chosen": -2.6118428707122803,
"logits/rejected": -2.625479221343994,
"logps/chosen": -239.4540252685547,
"logps/rejected": -232.90463256835938,
"loss": 0.3304,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 26.162424087524414,
"rewards/margins": 2.349818706512451,
"rewards/rejected": 23.812606811523438,
"step": 70
},
{
"epoch": 0.17,
"grad_norm": 15.840881084472352,
"learning_rate": 4.931986719649298e-07,
"logits/chosen": -2.7612788677215576,
"logits/rejected": -2.7243030071258545,
"logps/chosen": -295.0336608886719,
"logps/rejected": -240.8730010986328,
"loss": 0.3248,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 27.784252166748047,
"rewards/margins": 4.598628997802734,
"rewards/rejected": 23.185623168945312,
"step": 80
},
{
"epoch": 0.19,
"grad_norm": 13.661268677283298,
"learning_rate": 4.883222001996351e-07,
"logits/chosen": -2.6661014556884766,
"logits/rejected": -2.645249128341675,
"logps/chosen": -231.57553100585938,
"logps/rejected": -228.09091186523438,
"loss": 0.3223,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 27.535770416259766,
"rewards/margins": 3.228619337081909,
"rewards/rejected": 24.30714988708496,
"step": 90
},
{
"epoch": 0.21,
"grad_norm": 11.61288143003843,
"learning_rate": 4.821741763807186e-07,
"logits/chosen": -2.6386702060699463,
"logits/rejected": -2.6339759826660156,
"logps/chosen": -233.39047241210938,
"logps/rejected": -232.5922393798828,
"loss": 0.3163,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 26.968032836914062,
"rewards/margins": 2.5318057537078857,
"rewards/rejected": 24.436227798461914,
"step": 100
},
{
"epoch": 0.21,
"eval_logits/chosen": -2.6968541145324707,
"eval_logits/rejected": -2.670072555541992,
"eval_logps/chosen": -235.37875366210938,
"eval_logps/rejected": -238.44345092773438,
"eval_loss": 0.31289389729499817,
"eval_rewards/accuracies": 0.58203125,
"eval_rewards/chosen": 27.21471405029297,
"eval_rewards/margins": 2.99098801612854,
"eval_rewards/rejected": 24.223726272583008,
"eval_runtime": 96.735,
"eval_samples_per_second": 20.675,
"eval_steps_per_second": 0.331,
"step": 100
},
{
"epoch": 0.23,
"grad_norm": 11.688620320219954,
"learning_rate": 4.747874028753375e-07,
"logits/chosen": -2.7125041484832764,
"logits/rejected": -2.6624934673309326,
"logps/chosen": -276.029052734375,
"logps/rejected": -234.1141815185547,
"loss": 0.3136,
"rewards/accuracies": 0.625,
"rewards/chosen": 28.551036834716797,
"rewards/margins": 4.829342842102051,
"rewards/rejected": 23.72169303894043,
"step": 110
},
{
"epoch": 0.25,
"grad_norm": 14.849649400244427,
"learning_rate": 4.662012913161997e-07,
"logits/chosen": -2.6516470909118652,
"logits/rejected": -2.647688865661621,
"logps/chosen": -253.4019317626953,
"logps/rejected": -234.5045623779297,
"loss": 0.3065,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 27.765233993530273,
"rewards/margins": 2.4132068157196045,
"rewards/rejected": 25.352027893066406,
"step": 120
},
{
"epoch": 0.27,
"grad_norm": 12.095477452171375,
"learning_rate": 4.5646165232345103e-07,
"logits/chosen": -2.679412364959717,
"logits/rejected": -2.6742541790008545,
"logps/chosen": -249.6054229736328,
"logps/rejected": -241.8912811279297,
"loss": 0.2993,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 32.39772415161133,
"rewards/margins": 5.853152275085449,
"rewards/rejected": 26.544570922851562,
"step": 130
},
{
"epoch": 0.29,
"grad_norm": 13.237989201417717,
"learning_rate": 4.456204510851956e-07,
"logits/chosen": -2.7010607719421387,
"logits/rejected": -2.689103603363037,
"logps/chosen": -284.6669921875,
"logps/rejected": -270.44970703125,
"loss": 0.3016,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 31.298425674438477,
"rewards/margins": 1.071274995803833,
"rewards/rejected": 30.22715187072754,
"step": 140
},
{
"epoch": 0.31,
"grad_norm": 11.533759549255185,
"learning_rate": 4.337355301007335e-07,
"logits/chosen": -2.6910300254821777,
"logits/rejected": -2.6623480319976807,
"logps/chosen": -251.215576171875,
"logps/rejected": -248.98348999023438,
"loss": 0.2985,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 32.008628845214844,
"rewards/margins": 4.783123970031738,
"rewards/rejected": 27.225509643554688,
"step": 150
},
{
"epoch": 0.33,
"grad_norm": 13.117822478323479,
"learning_rate": 4.2087030056579986e-07,
"logits/chosen": -2.721895217895508,
"logits/rejected": -2.675842523574829,
"logps/chosen": -242.4053192138672,
"logps/rejected": -230.8060302734375,
"loss": 0.3009,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 30.662723541259766,
"rewards/margins": 4.044883728027344,
"rewards/rejected": 26.61783790588379,
"step": 160
},
{
"epoch": 0.36,
"grad_norm": 11.340151801902158,
"learning_rate": 4.070934040463998e-07,
"logits/chosen": -2.670436382293701,
"logits/rejected": -2.632450819015503,
"logps/chosen": -220.5222625732422,
"logps/rejected": -204.80908203125,
"loss": 0.2938,
"rewards/accuracies": 0.59375,
"rewards/chosen": 28.81294822692871,
"rewards/margins": 2.497253179550171,
"rewards/rejected": 26.31569480895996,
"step": 170
},
{
"epoch": 0.38,
"grad_norm": 11.477634324684333,
"learning_rate": 3.9247834624635404e-07,
"logits/chosen": -2.646768093109131,
"logits/rejected": -2.6306955814361572,
"logps/chosen": -225.45016479492188,
"logps/rejected": -200.42015075683594,
"loss": 0.2914,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 30.908817291259766,
"rewards/margins": 3.7578415870666504,
"rewards/rejected": 27.150976181030273,
"step": 180
},
{
"epoch": 0.4,
"grad_norm": 13.566633133843082,
"learning_rate": 3.7710310482256523e-07,
"logits/chosen": -2.679771900177002,
"logits/rejected": -2.6499440670013428,
"logps/chosen": -241.45156860351562,
"logps/rejected": -231.2630615234375,
"loss": 0.2963,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 29.79128646850586,
"rewards/margins": 1.2995483875274658,
"rewards/rejected": 28.49173927307129,
"step": 190
},
{
"epoch": 0.42,
"grad_norm": 16.736011308973627,
"learning_rate": 3.610497133404795e-07,
"logits/chosen": -2.630007028579712,
"logits/rejected": -2.6183559894561768,
"logps/chosen": -230.09048461914062,
"logps/rejected": -223.8180694580078,
"loss": 0.2918,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 29.806177139282227,
"rewards/margins": 1.575269341468811,
"rewards/rejected": 28.230907440185547,
"step": 200
},
{
"epoch": 0.42,
"eval_logits/chosen": -2.708475112915039,
"eval_logits/rejected": -2.682575225830078,
"eval_logps/chosen": -232.24124145507812,
"eval_logps/rejected": -236.21038818359375,
"eval_loss": 0.29230329394340515,
"eval_rewards/accuracies": 0.58203125,
"eval_rewards/chosen": 30.35222816467285,
"eval_rewards/margins": 3.8954334259033203,
"eval_rewards/rejected": 26.45679473876953,
"eval_runtime": 96.829,
"eval_samples_per_second": 20.655,
"eval_steps_per_second": 0.33,
"step": 200
},
{
"epoch": 0.44,
"grad_norm": 11.417465496451523,
"learning_rate": 3.4440382358952115e-07,
"logits/chosen": -2.6330389976501465,
"logits/rejected": -2.6055209636688232,
"logps/chosen": -257.6673889160156,
"logps/rejected": -225.943359375,
"loss": 0.2902,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 32.02475357055664,
"rewards/margins": 6.720486640930176,
"rewards/rejected": 25.304264068603516,
"step": 210
},
{
"epoch": 0.46,
"grad_norm": 12.04727391696027,
"learning_rate": 3.272542485937368e-07,
"logits/chosen": -2.5957412719726562,
"logits/rejected": -2.5795822143554688,
"logps/chosen": -233.29476928710938,
"logps/rejected": -217.3531951904297,
"loss": 0.2919,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 32.082313537597656,
"rewards/margins": 3.7717392444610596,
"rewards/rejected": 28.310577392578125,
"step": 220
},
{
"epoch": 0.48,
"grad_norm": 11.505656123665526,
"learning_rate": 3.096924887558854e-07,
"logits/chosen": -2.6124305725097656,
"logits/rejected": -2.5944228172302246,
"logps/chosen": -217.5354461669922,
"logps/rejected": -220.5460205078125,
"loss": 0.3047,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 31.32999038696289,
"rewards/margins": 4.138183116912842,
"rewards/rejected": 27.19180679321289,
"step": 230
},
{
"epoch": 0.5,
"grad_norm": 11.083392566284138,
"learning_rate": 2.9181224366319943e-07,
"logits/chosen": -2.660727024078369,
"logits/rejected": -2.6385245323181152,
"logps/chosen": -232.0665740966797,
"logps/rejected": -219.62210083007812,
"loss": 0.2834,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 31.633642196655273,
"rewards/margins": 2.1873562335968018,
"rewards/rejected": 29.446285247802734,
"step": 240
},
{
"epoch": 0.52,
"grad_norm": 11.463127161742676,
"learning_rate": 2.7370891215954565e-07,
"logits/chosen": -2.6206917762756348,
"logits/rejected": -2.576387405395508,
"logps/chosen": -264.06439208984375,
"logps/rejected": -229.7786865234375,
"loss": 0.2818,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 34.12608337402344,
"rewards/margins": 4.382205009460449,
"rewards/rejected": 29.743881225585938,
"step": 250
},
{
"epoch": 0.54,
"grad_norm": 10.661524920447267,
"learning_rate": 2.55479083351317e-07,
"logits/chosen": -2.6774675846099854,
"logits/rejected": -2.668527364730835,
"logps/chosen": -260.33514404296875,
"logps/rejected": -225.80810546875,
"loss": 0.2858,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 33.976402282714844,
"rewards/margins": 5.804098606109619,
"rewards/rejected": 28.17230224609375,
"step": 260
},
{
"epoch": 0.56,
"grad_norm": 11.916616915089687,
"learning_rate": 2.3722002126275822e-07,
"logits/chosen": -2.6731224060058594,
"logits/rejected": -2.6551766395568848,
"logps/chosen": -245.6435089111328,
"logps/rejected": -228.1649932861328,
"loss": 0.2808,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 31.35245704650879,
"rewards/margins": 1.8731645345687866,
"rewards/rejected": 29.479290008544922,
"step": 270
},
{
"epoch": 0.59,
"grad_norm": 11.982078860289866,
"learning_rate": 2.19029145890313e-07,
"logits/chosen": -2.6452529430389404,
"logits/rejected": -2.6127915382385254,
"logps/chosen": -229.02554321289062,
"logps/rejected": -215.188720703125,
"loss": 0.2835,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 32.651554107666016,
"rewards/margins": 5.653929233551025,
"rewards/rejected": 26.99761962890625,
"step": 280
},
{
"epoch": 0.61,
"grad_norm": 11.17239233559609,
"learning_rate": 2.0100351342479216e-07,
"logits/chosen": -2.675553321838379,
"logits/rejected": -2.662069082260132,
"logps/chosen": -219.8170928955078,
"logps/rejected": -211.7806396484375,
"loss": 0.2849,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 31.27024269104004,
"rewards/margins": 1.0949894189834595,
"rewards/rejected": 30.175247192382812,
"step": 290
},
{
"epoch": 0.63,
"grad_norm": 9.847053265544167,
"learning_rate": 1.8323929841460178e-07,
"logits/chosen": -2.65397572517395,
"logits/rejected": -2.6134414672851562,
"logps/chosen": -268.84588623046875,
"logps/rejected": -232.80752563476562,
"loss": 0.286,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 32.15021514892578,
"rewards/margins": 4.852233409881592,
"rewards/rejected": 27.297988891601562,
"step": 300
},
{
"epoch": 0.63,
"eval_logits/chosen": -2.695726156234741,
"eval_logits/rejected": -2.6716713905334473,
"eval_logps/chosen": -231.15402221679688,
"eval_logps/rejected": -235.42864990234375,
"eval_loss": 0.29209351539611816,
"eval_rewards/accuracies": 0.58203125,
"eval_rewards/chosen": 31.439437866210938,
"eval_rewards/margins": 4.200903415679932,
"eval_rewards/rejected": 27.238534927368164,
"eval_runtime": 96.789,
"eval_samples_per_second": 20.664,
"eval_steps_per_second": 0.331,
"step": 300
},
{
"epoch": 0.65,
"grad_norm": 11.299461074514115,
"learning_rate": 1.6583128063291573e-07,
"logits/chosen": -2.6087942123413086,
"logits/rejected": -2.607959270477295,
"logps/chosen": -263.2939758300781,
"logps/rejected": -229.5752716064453,
"loss": 0.2804,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 32.041908264160156,
"rewards/margins": 3.100654363632202,
"rewards/rejected": 28.941247940063477,
"step": 310
},
{
"epoch": 0.67,
"grad_norm": 11.979925902064297,
"learning_rate": 1.488723393865766e-07,
"logits/chosen": -2.652468204498291,
"logits/rejected": -2.6433398723602295,
"logps/chosen": -260.83233642578125,
"logps/rejected": -216.2664337158203,
"loss": 0.2788,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 32.8377571105957,
"rewards/margins": 4.280916213989258,
"rewards/rejected": 28.556838989257812,
"step": 320
},
{
"epoch": 0.69,
"grad_norm": 10.289416601586245,
"learning_rate": 1.3245295796480788e-07,
"logits/chosen": -2.678496837615967,
"logits/rejected": -2.634920835494995,
"logps/chosen": -229.55624389648438,
"logps/rejected": -231.64407348632812,
"loss": 0.2812,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 32.6539306640625,
"rewards/margins": 4.799349784851074,
"rewards/rejected": 27.854583740234375,
"step": 330
},
{
"epoch": 0.71,
"grad_norm": 12.940304501019066,
"learning_rate": 1.1666074087171627e-07,
"logits/chosen": -2.687782049179077,
"logits/rejected": -2.6474757194519043,
"logps/chosen": -258.529541015625,
"logps/rejected": -247.69125366210938,
"loss": 0.2752,
"rewards/accuracies": 0.53125,
"rewards/chosen": 31.464908599853516,
"rewards/margins": -0.8856052160263062,
"rewards/rejected": 32.35051727294922,
"step": 340
},
{
"epoch": 0.73,
"grad_norm": 13.446019747621028,
"learning_rate": 1.0157994641835734e-07,
"logits/chosen": -2.6681811809539795,
"logits/rejected": -2.6358139514923096,
"logps/chosen": -227.58425903320312,
"logps/rejected": -212.9467010498047,
"loss": 0.2866,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 30.626983642578125,
"rewards/margins": 2.8648905754089355,
"rewards/rejected": 27.7620906829834,
"step": 350
},
{
"epoch": 0.75,
"grad_norm": 10.212615361555141,
"learning_rate": 8.729103716819111e-08,
"logits/chosen": -2.691338300704956,
"logits/rejected": -2.6329030990600586,
"logps/chosen": -269.2547302246094,
"logps/rejected": -233.14053344726562,
"loss": 0.2785,
"rewards/accuracies": 0.625,
"rewards/chosen": 33.437278747558594,
"rewards/margins": 5.27285623550415,
"rewards/rejected": 28.1644287109375,
"step": 360
},
{
"epoch": 0.77,
"grad_norm": 12.701608094493194,
"learning_rate": 7.387025063449081e-08,
"logits/chosen": -2.6507325172424316,
"logits/rejected": -2.6226696968078613,
"logps/chosen": -243.0960693359375,
"logps/rejected": -207.664794921875,
"loss": 0.2854,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 32.23695373535156,
"rewards/margins": 1.6676933765411377,
"rewards/rejected": 30.569263458251953,
"step": 370
},
{
"epoch": 0.79,
"grad_norm": 11.004484883830752,
"learning_rate": 6.138919252022435e-08,
"logits/chosen": -2.592874526977539,
"logits/rejected": -2.5939741134643555,
"logps/chosen": -206.689697265625,
"logps/rejected": -228.67898559570312,
"loss": 0.2774,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 32.79497146606445,
"rewards/margins": 2.7575299739837646,
"rewards/rejected": 30.037445068359375,
"step": 380
},
{
"epoch": 0.82,
"grad_norm": 12.608909298282311,
"learning_rate": 4.991445467064689e-08,
"logits/chosen": -2.6360385417938232,
"logits/rejected": -2.6261894702911377,
"logps/chosen": -270.9910888671875,
"logps/rejected": -252.8332977294922,
"loss": 0.276,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 34.645816802978516,
"rewards/margins": 3.508648633956909,
"rewards/rejected": 31.137165069580078,
"step": 390
},
{
"epoch": 0.84,
"grad_norm": 11.347134923103408,
"learning_rate": 3.9507259776993954e-08,
"logits/chosen": -2.632523775100708,
"logits/rejected": -2.594832181930542,
"logps/chosen": -236.8807830810547,
"logps/rejected": -237.6399688720703,
"loss": 0.2819,
"rewards/accuracies": 0.625,
"rewards/chosen": 33.6544189453125,
"rewards/margins": 4.281933784484863,
"rewards/rejected": 29.372488021850586,
"step": 400
},
{
"epoch": 0.84,
"eval_logits/chosen": -2.6868975162506104,
"eval_logits/rejected": -2.66192626953125,
"eval_logps/chosen": -230.7387237548828,
"eval_logps/rejected": -235.19105529785156,
"eval_loss": 0.2787904143333435,
"eval_rewards/accuracies": 0.578125,
"eval_rewards/chosen": 31.854747772216797,
"eval_rewards/margins": 4.3786234855651855,
"eval_rewards/rejected": 27.476125717163086,
"eval_runtime": 96.6885,
"eval_samples_per_second": 20.685,
"eval_steps_per_second": 0.331,
"step": 400
},
{
"epoch": 0.86,
"grad_norm": 12.175943173191595,
"learning_rate": 3.022313472693447e-08,
"logits/chosen": -2.6695199012756348,
"logits/rejected": -2.626798152923584,
"logps/chosen": -263.4989318847656,
"logps/rejected": -240.9721221923828,
"loss": 0.2806,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 35.418556213378906,
"rewards/margins": 7.573515892028809,
"rewards/rejected": 27.845043182373047,
"step": 410
},
{
"epoch": 0.88,
"grad_norm": 11.7624491150407,
"learning_rate": 2.2111614344599684e-08,
"logits/chosen": -2.6308817863464355,
"logits/rejected": -2.620222568511963,
"logps/chosen": -264.280517578125,
"logps/rejected": -247.2097625732422,
"loss": 0.2882,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 32.79326248168945,
"rewards/margins": 5.5407843589782715,
"rewards/rejected": 27.252477645874023,
"step": 420
},
{
"epoch": 0.9,
"grad_norm": 11.16296113559481,
"learning_rate": 1.521597710086439e-08,
"logits/chosen": -2.577580213546753,
"logits/rejected": -2.5429909229278564,
"logps/chosen": -248.5481719970703,
"logps/rejected": -228.4681396484375,
"loss": 0.2851,
"rewards/accuracies": 0.53125,
"rewards/chosen": 30.489971160888672,
"rewards/margins": 1.1781085729599,
"rewards/rejected": 29.311859130859375,
"step": 430
},
{
"epoch": 0.92,
"grad_norm": 10.453636294498436,
"learning_rate": 9.57301420397924e-09,
"logits/chosen": -2.654780864715576,
"logits/rejected": -2.619481086730957,
"logps/chosen": -251.1508026123047,
"logps/rejected": -240.0060272216797,
"loss": 0.2805,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 33.18633270263672,
"rewards/margins": 4.080627918243408,
"rewards/rejected": 29.1057071685791,
"step": 440
},
{
"epoch": 0.94,
"grad_norm": 10.779162534358996,
"learning_rate": 5.212833302556258e-09,
"logits/chosen": -2.598240375518799,
"logits/rejected": -2.6028037071228027,
"logps/chosen": -259.9753112792969,
"logps/rejected": -276.95166015625,
"loss": 0.2836,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 33.70884323120117,
"rewards/margins": 3.5860488414764404,
"rewards/rejected": 30.122793197631836,
"step": 450
},
{
"epoch": 0.96,
"grad_norm": 12.07874608208951,
"learning_rate": 2.158697848236607e-09,
"logits/chosen": -2.6384501457214355,
"logits/rejected": -2.618943452835083,
"logps/chosen": -240.47885131835938,
"logps/rejected": -213.6422882080078,
"loss": 0.2815,
"rewards/accuracies": 0.5625,
"rewards/chosen": 31.485698699951172,
"rewards/margins": 2.44018292427063,
"rewards/rejected": 29.045513153076172,
"step": 460
},
{
"epoch": 0.98,
"grad_norm": 11.390948919388384,
"learning_rate": 4.269029751107489e-10,
"logits/chosen": -2.6327641010284424,
"logits/rejected": -2.6079437732696533,
"logps/chosen": -245.8006591796875,
"logps/rejected": -253.76730346679688,
"loss": 0.2778,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 32.898033142089844,
"rewards/margins": 4.314266204833984,
"rewards/rejected": 28.58376121520996,
"step": 470
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 0.0,
"train_loss": 0.31381568898715734,
"train_runtime": 7749.4814,
"train_samples_per_second": 7.889,
"train_steps_per_second": 0.062
}
],
"logging_steps": 10,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}