tulu-2-7b-full-UF-5e-7 / trainer_state.json
just1nseo's picture
Model save
42963e7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 1359,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 6.209654163226836,
"learning_rate": 3.676470588235294e-09,
"logits/chosen": -1.4681403636932373,
"logits/rejected": -0.8821791410446167,
"logps/chosen": -326.7279052734375,
"logps/rejected": -393.66143798828125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/margins_max": 0.0,
"rewards/margins_min": 0.0,
"rewards/margins_std": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 7.833527457219724,
"learning_rate": 3.676470588235294e-08,
"logits/chosen": -1.1554194688796997,
"logits/rejected": -1.069737434387207,
"logps/chosen": -260.11224365234375,
"logps/rejected": -278.21954345703125,
"loss": 0.693,
"rewards/accuracies": 0.3888888955116272,
"rewards/chosen": 0.001127632916904986,
"rewards/margins": 0.001941706403158605,
"rewards/margins_max": 0.0066660139709711075,
"rewards/margins_min": -0.0027826009318232536,
"rewards/margins_std": 0.006681179627776146,
"rewards/rejected": -0.0008140733698382974,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 5.539912592900294,
"learning_rate": 7.352941176470588e-08,
"logits/chosen": -1.1387906074523926,
"logits/rejected": -1.2151895761489868,
"logps/chosen": -226.5954132080078,
"logps/rejected": -194.97735595703125,
"loss": 0.6928,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0004003068897873163,
"rewards/margins": 0.0006232298910617828,
"rewards/margins_max": 0.0029323583003133535,
"rewards/margins_min": -0.0016858980525285006,
"rewards/margins_std": 0.0032655999530106783,
"rewards/rejected": -0.0002229233068646863,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 9.074297699065875,
"learning_rate": 1.1029411764705881e-07,
"logits/chosen": -0.9134622812271118,
"logits/rejected": -1.1061055660247803,
"logps/chosen": -286.9056091308594,
"logps/rejected": -306.0609130859375,
"loss": 0.693,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0033608167432248592,
"rewards/margins": 0.0010996473720297217,
"rewards/margins_max": 0.003882316406816244,
"rewards/margins_min": -0.0016830215463414788,
"rewards/margins_std": 0.003935288172215223,
"rewards/rejected": 0.0022611692547798157,
"step": 30
},
{
"epoch": 0.03,
"grad_norm": 6.630354149069055,
"learning_rate": 1.4705882352941175e-07,
"logits/chosen": -0.9963411092758179,
"logits/rejected": -1.3301975727081299,
"logps/chosen": -237.13650512695312,
"logps/rejected": -233.420654296875,
"loss": 0.6928,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0003052559623029083,
"rewards/margins": -0.00031435777782462537,
"rewards/margins_max": 0.003875983878970146,
"rewards/margins_min": -0.0045046997256577015,
"rewards/margins_std": 0.005926038138568401,
"rewards/rejected": 9.101861905946862e-06,
"step": 40
},
{
"epoch": 0.04,
"grad_norm": 5.11233300662631,
"learning_rate": 1.8382352941176472e-07,
"logits/chosen": -0.9264333844184875,
"logits/rejected": -1.0728222131729126,
"logps/chosen": -219.332763671875,
"logps/rejected": -220.7531280517578,
"loss": 0.6927,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.0008567962795495987,
"rewards/margins": 0.0023684161715209484,
"rewards/margins_max": 0.00644815806299448,
"rewards/margins_min": -0.001711326651275158,
"rewards/margins_std": 0.005769627168774605,
"rewards/rejected": -0.001511619659140706,
"step": 50
},
{
"epoch": 0.04,
"grad_norm": 6.3634169311373245,
"learning_rate": 2.2058823529411763e-07,
"logits/chosen": -1.1445600986480713,
"logits/rejected": -1.3254610300064087,
"logps/chosen": -269.0830993652344,
"logps/rejected": -234.78726196289062,
"loss": 0.6913,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0016944237286224961,
"rewards/margins": 0.004126362036913633,
"rewards/margins_max": 0.006431617774069309,
"rewards/margins_min": 0.0018211060669273138,
"rewards/margins_std": 0.0032601244747638702,
"rewards/rejected": -0.0024319379590451717,
"step": 60
},
{
"epoch": 0.05,
"grad_norm": 5.891925253538908,
"learning_rate": 2.5735294117647057e-07,
"logits/chosen": -1.414535403251648,
"logits/rejected": -1.5020934343338013,
"logps/chosen": -295.0069580078125,
"logps/rejected": -283.39984130859375,
"loss": 0.6904,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.003248781431466341,
"rewards/margins": 0.005960130598396063,
"rewards/margins_max": 0.01083610113710165,
"rewards/margins_min": 0.0010841598268598318,
"rewards/margins_std": 0.006895663682371378,
"rewards/rejected": -0.0027113493997603655,
"step": 70
},
{
"epoch": 0.06,
"grad_norm": 27.08598358859191,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": -1.1046959161758423,
"logits/rejected": -1.121512770652771,
"logps/chosen": -233.47909545898438,
"logps/rejected": -228.24447631835938,
"loss": 0.6879,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.0034522090572863817,
"rewards/margins": 0.010937942191958427,
"rewards/margins_max": 0.015041169710457325,
"rewards/margins_min": 0.006834716536104679,
"rewards/margins_std": 0.005802837200462818,
"rewards/rejected": -0.007485733367502689,
"step": 80
},
{
"epoch": 0.07,
"grad_norm": 5.043695997862729,
"learning_rate": 3.3088235294117644e-07,
"logits/chosen": -1.1739518642425537,
"logits/rejected": -1.1855499744415283,
"logps/chosen": -201.79940795898438,
"logps/rejected": -239.0184783935547,
"loss": 0.6861,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.005821605678647757,
"rewards/margins": 0.016343099996447563,
"rewards/margins_max": 0.022502990439534187,
"rewards/margins_min": 0.010183211416006088,
"rewards/margins_std": 0.008711399510502815,
"rewards/rejected": -0.010521495714783669,
"step": 90
},
{
"epoch": 0.07,
"grad_norm": 6.240948244130348,
"learning_rate": 3.6764705882352943e-07,
"logits/chosen": -1.226905345916748,
"logits/rejected": -1.402093529701233,
"logps/chosen": -276.8337707519531,
"logps/rejected": -248.4552459716797,
"loss": 0.6816,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.008135917596518993,
"rewards/margins": 0.024016622453927994,
"rewards/margins_max": 0.033059027045965195,
"rewards/margins_min": 0.014974219724535942,
"rewards/margins_std": 0.012787890620529652,
"rewards/rejected": -0.015880707651376724,
"step": 100
},
{
"epoch": 0.07,
"eval_logits/chosen": -1.1694660186767578,
"eval_logits/rejected": -1.1956290006637573,
"eval_logps/chosen": -345.8330993652344,
"eval_logps/rejected": -336.38427734375,
"eval_loss": 0.6919357776641846,
"eval_rewards/accuracies": 0.5416666865348816,
"eval_rewards/chosen": 2.3678861907683313e-05,
"eval_rewards/margins": 0.002057413337752223,
"eval_rewards/margins_max": 0.027664856985211372,
"eval_rewards/margins_min": -0.02450541965663433,
"eval_rewards/margins_std": 0.017513444647192955,
"eval_rewards/rejected": -0.002033734694123268,
"eval_runtime": 419.0939,
"eval_samples_per_second": 9.544,
"eval_steps_per_second": 0.15,
"step": 100
},
{
"epoch": 0.08,
"grad_norm": 5.566894084924568,
"learning_rate": 4.044117647058823e-07,
"logits/chosen": -1.3186091184616089,
"logits/rejected": -1.2772490978240967,
"logps/chosen": -379.5386657714844,
"logps/rejected": -246.4805450439453,
"loss": 0.6773,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.007909432053565979,
"rewards/margins": 0.024571493268013,
"rewards/margins_max": 0.036393627524375916,
"rewards/margins_min": 0.012749359011650085,
"rewards/margins_std": 0.016719024628400803,
"rewards/rejected": -0.01666206307709217,
"step": 110
},
{
"epoch": 0.09,
"grad_norm": 5.5380569300473175,
"learning_rate": 4.4117647058823526e-07,
"logits/chosen": -0.9861418008804321,
"logits/rejected": -1.2131096124649048,
"logps/chosen": -280.57135009765625,
"logps/rejected": -222.57217407226562,
"loss": 0.6696,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.017649073153734207,
"rewards/margins": 0.06698472797870636,
"rewards/margins_max": 0.10137734562158585,
"rewards/margins_min": 0.032592128962278366,
"rewards/margins_std": 0.0486384816467762,
"rewards/rejected": -0.04933566227555275,
"step": 120
},
{
"epoch": 0.1,
"grad_norm": 15.361998761868088,
"learning_rate": 4.779411764705882e-07,
"logits/chosen": -1.0785776376724243,
"logits/rejected": -0.898257851600647,
"logps/chosen": -283.1363525390625,
"logps/rejected": -214.15316772460938,
"loss": 0.6611,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02150227688252926,
"rewards/margins": 0.08108994364738464,
"rewards/margins_max": 0.10384353250265121,
"rewards/margins_min": 0.05833636596798897,
"rewards/margins_std": 0.032178424298763275,
"rewards/rejected": -0.059587668627500534,
"step": 130
},
{
"epoch": 0.1,
"grad_norm": 5.451118326133565,
"learning_rate": 4.999868030671756e-07,
"logits/chosen": -0.9526296854019165,
"logits/rejected": -0.9190389513969421,
"logps/chosen": -236.9579620361328,
"logps/rejected": -269.78240966796875,
"loss": 0.6586,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015391260385513306,
"rewards/margins": 0.07113742083311081,
"rewards/margins_max": 0.10363912582397461,
"rewards/margins_min": 0.0386357307434082,
"rewards/margins_std": 0.04596434161067009,
"rewards/rejected": -0.0557461753487587,
"step": 140
},
{
"epoch": 0.11,
"grad_norm": 5.767935898839982,
"learning_rate": 4.998383535732973e-07,
"logits/chosen": -1.1545963287353516,
"logits/rejected": -1.3083815574645996,
"logps/chosen": -272.58392333984375,
"logps/rejected": -251.5518798828125,
"loss": 0.637,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.027133097872138023,
"rewards/margins": 0.1410999596118927,
"rewards/margins_max": 0.18325701355934143,
"rewards/margins_min": 0.09894292801618576,
"rewards/margins_std": 0.05961906909942627,
"rewards/rejected": -0.11396688222885132,
"step": 150
},
{
"epoch": 0.12,
"grad_norm": 9.075069261969173,
"learning_rate": 4.995250566954361e-07,
"logits/chosen": -1.2339075803756714,
"logits/rejected": -1.3427120447158813,
"logps/chosen": -278.045654296875,
"logps/rejected": -249.33016967773438,
"loss": 0.621,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.021242624148726463,
"rewards/margins": 0.1352781355381012,
"rewards/margins_max": 0.18264132738113403,
"rewards/margins_min": 0.08791494369506836,
"rewards/margins_std": 0.06698166579008102,
"rewards/rejected": -0.11403550952672958,
"step": 160
},
{
"epoch": 0.13,
"grad_norm": 5.288881821825863,
"learning_rate": 4.990471191519357e-07,
"logits/chosen": -1.2296701669692993,
"logits/rejected": -1.3137729167938232,
"logps/chosen": -271.8497009277344,
"logps/rejected": -257.36285400390625,
"loss": 0.6231,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.027015607804059982,
"rewards/margins": 0.2098924219608307,
"rewards/margins_max": 0.28914040327072144,
"rewards/margins_min": 0.13064439594745636,
"rewards/margins_std": 0.1120736226439476,
"rewards/rejected": -0.1828767955303192,
"step": 170
},
{
"epoch": 0.13,
"grad_norm": 4.722529871025577,
"learning_rate": 4.984048562937129e-07,
"logits/chosen": -1.104107141494751,
"logits/rejected": -1.2799243927001953,
"logps/chosen": -267.16131591796875,
"logps/rejected": -320.7081298828125,
"loss": 0.599,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.009208987466990948,
"rewards/margins": 0.15969006717205048,
"rewards/margins_max": 0.21888110041618347,
"rewards/margins_min": 0.1004989966750145,
"rewards/margins_std": 0.08370877802371979,
"rewards/rejected": -0.15048107504844666,
"step": 180
},
{
"epoch": 0.14,
"grad_norm": 4.706882294745904,
"learning_rate": 4.975986918961825e-07,
"logits/chosen": -1.1564669609069824,
"logits/rejected": -1.3084397315979004,
"logps/chosen": -287.58294677734375,
"logps/rejected": -235.0350799560547,
"loss": 0.5751,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015282683074474335,
"rewards/margins": 0.2435847818851471,
"rewards/margins_max": 0.34512418508529663,
"rewards/margins_min": 0.14204536378383636,
"rewards/margins_std": 0.14359840750694275,
"rewards/rejected": -0.22830209136009216,
"step": 190
},
{
"epoch": 0.15,
"grad_norm": 6.10064839157769,
"learning_rate": 4.966291578796448e-07,
"logits/chosen": -1.2383778095245361,
"logits/rejected": -1.2699321508407593,
"logps/chosen": -246.54550170898438,
"logps/rejected": -299.7005920410156,
"loss": 0.5468,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.00011487379379104823,
"rewards/margins": 0.3421292304992676,
"rewards/margins_max": 0.5104770064353943,
"rewards/margins_min": 0.17378148436546326,
"rewards/margins_std": 0.23807969689369202,
"rewards/rejected": -0.3420143723487854,
"step": 200
},
{
"epoch": 0.15,
"eval_logits/chosen": -1.1466065645217896,
"eval_logits/rejected": -1.1508780717849731,
"eval_logps/chosen": -357.1989440917969,
"eval_logps/rejected": -350.5012512207031,
"eval_loss": 0.679348349571228,
"eval_rewards/accuracies": 0.579365074634552,
"eval_rewards/chosen": -0.11363494396209717,
"eval_rewards/margins": 0.029568513855338097,
"eval_rewards/margins_max": 0.24946285784244537,
"eval_rewards/margins_min": -0.1965206265449524,
"eval_rewards/margins_std": 0.1510881930589676,
"eval_rewards/rejected": -0.143203467130661,
"eval_runtime": 417.1858,
"eval_samples_per_second": 9.588,
"eval_steps_per_second": 0.151,
"step": 200
},
{
"epoch": 0.15,
"grad_norm": 7.130935509068585,
"learning_rate": 4.954968939583149e-07,
"logits/chosen": -0.82276850938797,
"logits/rejected": -1.0703377723693848,
"logps/chosen": -308.36981201171875,
"logps/rejected": -285.35321044921875,
"loss": 0.5356,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.06309916079044342,
"rewards/margins": 0.28919515013694763,
"rewards/margins_max": 0.4261881709098816,
"rewards/margins_min": 0.1522020846605301,
"rewards/margins_std": 0.1937374323606491,
"rewards/rejected": -0.35229426622390747,
"step": 210
},
{
"epoch": 0.16,
"grad_norm": 5.18218978578797,
"learning_rate": 4.942026472182297e-07,
"logits/chosen": -1.133894681930542,
"logits/rejected": -0.9819344282150269,
"logps/chosen": -357.5079345703125,
"logps/rejected": -290.6125183105469,
"loss": 0.5253,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.12442765384912491,
"rewards/margins": 0.420942485332489,
"rewards/margins_max": 0.6367592215538025,
"rewards/margins_min": 0.20512573421001434,
"rewards/margins_std": 0.30521097779273987,
"rewards/rejected": -0.5453701615333557,
"step": 220
},
{
"epoch": 0.17,
"grad_norm": 6.451783890738213,
"learning_rate": 4.92747271624308e-07,
"logits/chosen": -1.1002264022827148,
"logits/rejected": -1.1289845705032349,
"logps/chosen": -307.14483642578125,
"logps/rejected": -330.2859802246094,
"loss": 0.494,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.14728474617004395,
"rewards/margins": 0.6978201866149902,
"rewards/margins_max": 1.030912160873413,
"rewards/margins_min": 0.36472827196121216,
"rewards/margins_std": 0.47106313705444336,
"rewards/rejected": -0.845104992389679,
"step": 230
},
{
"epoch": 0.18,
"grad_norm": 6.092745398297892,
"learning_rate": 4.911317274568909e-07,
"logits/chosen": -1.1411150693893433,
"logits/rejected": -1.1094478368759155,
"logps/chosen": -294.82550048828125,
"logps/rejected": -408.50970458984375,
"loss": 0.4335,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.1695319563150406,
"rewards/margins": 0.7641543745994568,
"rewards/margins_max": 1.1791099309921265,
"rewards/margins_min": 0.34919896721839905,
"rewards/margins_std": 0.586835503578186,
"rewards/rejected": -0.933686375617981,
"step": 240
},
{
"epoch": 0.18,
"grad_norm": 13.168192652840903,
"learning_rate": 4.89357080678133e-07,
"logits/chosen": -1.0950664281845093,
"logits/rejected": -1.240697979927063,
"logps/chosen": -269.51092529296875,
"logps/rejected": -296.14837646484375,
"loss": 0.4457,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.32782456278800964,
"rewards/margins": 0.6738765835762024,
"rewards/margins_max": 0.9242515563964844,
"rewards/margins_min": 0.423501580953598,
"rewards/margins_std": 0.35408374667167664,
"rewards/rejected": -1.0017011165618896,
"step": 250
},
{
"epoch": 0.19,
"grad_norm": 5.63734344760071,
"learning_rate": 4.874245022286637e-07,
"logits/chosen": -1.1380219459533691,
"logits/rejected": -0.8845139741897583,
"logps/chosen": -245.44686889648438,
"logps/rejected": -377.0203552246094,
"loss": 0.4311,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.39118385314941406,
"rewards/margins": 0.8953431844711304,
"rewards/margins_max": 1.3878755569458008,
"rewards/margins_min": 0.40281087160110474,
"rewards/margins_std": 0.6965457797050476,
"rewards/rejected": -1.2865270376205444,
"step": 260
},
{
"epoch": 0.2,
"grad_norm": 5.2265046259602705,
"learning_rate": 4.853352672549815e-07,
"logits/chosen": -0.9493010640144348,
"logits/rejected": -0.9017621874809265,
"logps/chosen": -434.3206481933594,
"logps/rejected": -370.5262451171875,
"loss": 0.4015,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.5151541829109192,
"rewards/margins": 0.754838764667511,
"rewards/margins_max": 1.1408073902130127,
"rewards/margins_min": 0.3688700795173645,
"rewards/margins_std": 0.5458420515060425,
"rewards/rejected": -1.2699930667877197,
"step": 270
},
{
"epoch": 0.21,
"grad_norm": 9.133504257567045,
"learning_rate": 4.830907542680918e-07,
"logits/chosen": -1.0836373567581177,
"logits/rejected": -0.9045012593269348,
"logps/chosen": -264.9966125488281,
"logps/rejected": -428.46539306640625,
"loss": 0.3691,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.5287370681762695,
"rewards/margins": 2.0548110008239746,
"rewards/margins_max": 3.620879650115967,
"rewards/margins_min": 0.48874226212501526,
"rewards/margins_std": 2.2147555351257324,
"rewards/rejected": -2.583548069000244,
"step": 280
},
{
"epoch": 0.21,
"grad_norm": 5.917073426239516,
"learning_rate": 4.806924442339425e-07,
"logits/chosen": -1.0086328983306885,
"logits/rejected": -0.8821426630020142,
"logps/chosen": -305.4242248535156,
"logps/rejected": -435.61737060546875,
"loss": 0.3813,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.40681153535842896,
"rewards/margins": 1.1146458387374878,
"rewards/margins_max": 1.7330601215362549,
"rewards/margins_min": 0.4962318539619446,
"rewards/margins_std": 0.8745697140693665,
"rewards/rejected": -1.5214574337005615,
"step": 290
},
{
"epoch": 0.22,
"grad_norm": 7.285903481113855,
"learning_rate": 4.781419195962598e-07,
"logits/chosen": -0.997855544090271,
"logits/rejected": -0.9541902542114258,
"logps/chosen": -299.9017639160156,
"logps/rejected": -388.34246826171875,
"loss": 0.3597,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.6117764711380005,
"rewards/margins": 0.7978827953338623,
"rewards/margins_max": 1.1113024950027466,
"rewards/margins_min": 0.4844631552696228,
"rewards/margins_std": 0.44324231147766113,
"rewards/rejected": -1.4096593856811523,
"step": 300
},
{
"epoch": 0.22,
"eval_logits/chosen": -1.0628585815429688,
"eval_logits/rejected": -1.051159143447876,
"eval_logps/chosen": -439.3020324707031,
"eval_logps/rejected": -442.590576171875,
"eval_loss": 0.6787940859794617,
"eval_rewards/accuracies": 0.5714285969734192,
"eval_rewards/chosen": -0.9346656203269958,
"eval_rewards/margins": 0.12943138182163239,
"eval_rewards/margins_max": 1.008405089378357,
"eval_rewards/margins_min": -0.7319620251655579,
"eval_rewards/margins_std": 0.5778602361679077,
"eval_rewards/rejected": -1.0640968084335327,
"eval_runtime": 418.5023,
"eval_samples_per_second": 9.558,
"eval_steps_per_second": 0.151,
"step": 300
},
{
"epoch": 0.23,
"grad_norm": 10.782754556563047,
"learning_rate": 4.754408632324253e-07,
"logits/chosen": -1.1973422765731812,
"logits/rejected": -0.9350277781486511,
"logps/chosen": -318.24627685546875,
"logps/rejected": -525.6881103515625,
"loss": 0.3331,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.7586840987205505,
"rewards/margins": 2.185839891433716,
"rewards/margins_max": 2.856729745864868,
"rewards/margins_min": 1.5149496793746948,
"rewards/margins_std": 0.9487816691398621,
"rewards/rejected": -2.944523811340332,
"step": 310
},
{
"epoch": 0.24,
"grad_norm": 5.923061735298404,
"learning_rate": 4.725910573430866e-07,
"logits/chosen": -1.0679926872253418,
"logits/rejected": -0.945013165473938,
"logps/chosen": -365.65472412109375,
"logps/rejected": -405.1241760253906,
"loss": 0.3627,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.8854155540466309,
"rewards/margins": 0.7383102178573608,
"rewards/margins_max": 1.1103546619415283,
"rewards/margins_min": 0.3662659227848053,
"rewards/margins_std": 0.5261501669883728,
"rewards/rejected": -1.6237256526947021,
"step": 320
},
{
"epoch": 0.24,
"grad_norm": 7.098229956454526,
"learning_rate": 4.6959438227623293e-07,
"logits/chosen": -1.1373931169509888,
"logits/rejected": -0.862761378288269,
"logps/chosen": -276.69671630859375,
"logps/rejected": -535.3623046875,
"loss": 0.2945,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7917782068252563,
"rewards/margins": 2.616485595703125,
"rewards/margins_max": 4.648871421813965,
"rewards/margins_min": 0.5841000080108643,
"rewards/margins_std": 2.874227523803711,
"rewards/rejected": -3.40826416015625,
"step": 330
},
{
"epoch": 0.25,
"grad_norm": 8.024599779277368,
"learning_rate": 4.664528152865105e-07,
"logits/chosen": -0.7721256613731384,
"logits/rejected": -0.8172466158866882,
"logps/chosen": -349.3388671875,
"logps/rejected": -486.68597412109375,
"loss": 0.2734,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.1078470945358276,
"rewards/margins": 1.6050605773925781,
"rewards/margins_max": 2.6409249305725098,
"rewards/margins_min": 0.5691961646080017,
"rewards/margins_std": 1.4649332761764526,
"rewards/rejected": -2.712907552719116,
"step": 340
},
{
"epoch": 0.26,
"grad_norm": 7.882556555729322,
"learning_rate": 4.6316842923059816e-07,
"logits/chosen": -1.0482970476150513,
"logits/rejected": -0.8200104832649231,
"logps/chosen": -331.43133544921875,
"logps/rejected": -785.7190551757812,
"loss": 0.3029,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.9469194412231445,
"rewards/margins": 4.725480556488037,
"rewards/margins_max": 8.024388313293457,
"rewards/margins_min": 1.4265724420547485,
"rewards/margins_std": 4.665360450744629,
"rewards/rejected": -5.672399997711182,
"step": 350
},
{
"epoch": 0.26,
"grad_norm": 11.928720576155937,
"learning_rate": 4.5974339119950334e-07,
"logits/chosen": -0.9947048425674438,
"logits/rejected": -0.8432388305664062,
"logps/chosen": -433.1314392089844,
"logps/rejected": -583.6080932617188,
"loss": 0.2952,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.5369694232940674,
"rewards/margins": 1.8583523035049438,
"rewards/margins_max": 2.645268678665161,
"rewards/margins_min": 1.0714359283447266,
"rewards/margins_std": 1.1128677129745483,
"rewards/rejected": -3.3953216075897217,
"step": 360
},
{
"epoch": 0.27,
"grad_norm": 5.540434948793406,
"learning_rate": 4.5617996108867997e-07,
"logits/chosen": -0.8581298589706421,
"logits/rejected": -0.3961424231529236,
"logps/chosen": -412.405517578125,
"logps/rejected": -812.7819213867188,
"loss": 0.2262,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.617913007736206,
"rewards/margins": 4.383803844451904,
"rewards/margins_max": 7.294039249420166,
"rewards/margins_min": 1.4735687971115112,
"rewards/margins_std": 4.115694522857666,
"rewards/rejected": -6.001717567443848,
"step": 370
},
{
"epoch": 0.28,
"grad_norm": 17.370609516247765,
"learning_rate": 4.5248049010691304e-07,
"logits/chosen": -1.0891549587249756,
"logits/rejected": -0.69083172082901,
"logps/chosen": -347.2943420410156,
"logps/rejected": -703.4866943359375,
"loss": 0.2504,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3099722862243652,
"rewards/margins": 3.3620052337646484,
"rewards/margins_max": 5.631108283996582,
"rewards/margins_min": 1.092902421951294,
"rewards/margins_std": 3.208995819091797,
"rewards/rejected": -4.6719770431518555,
"step": 380
},
{
"epoch": 0.29,
"grad_norm": 9.292751258662012,
"learning_rate": 4.486474192249533e-07,
"logits/chosen": -1.0247005224227905,
"logits/rejected": -0.6028069853782654,
"logps/chosen": -442.56671142578125,
"logps/rejected": -660.4315185546875,
"loss": 0.2063,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4350707530975342,
"rewards/margins": 2.895498514175415,
"rewards/margins_max": 3.9468486309051514,
"rewards/margins_min": 1.8441476821899414,
"rewards/margins_std": 1.4868338108062744,
"rewards/rejected": -4.330569267272949,
"step": 390
},
{
"epoch": 0.29,
"grad_norm": 5.8794814274178755,
"learning_rate": 4.4468327756492504e-07,
"logits/chosen": -0.7380314469337463,
"logits/rejected": -0.5135469436645508,
"logps/chosen": -366.635986328125,
"logps/rejected": -607.7274780273438,
"loss": 0.2059,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.5760023593902588,
"rewards/margins": 2.5897469520568848,
"rewards/margins_max": 3.876375913619995,
"rewards/margins_min": 1.3031187057495117,
"rewards/margins_std": 1.8195674419403076,
"rewards/rejected": -4.165749549865723,
"step": 400
},
{
"epoch": 0.29,
"eval_logits/chosen": -0.8807379603385925,
"eval_logits/rejected": -0.8695055842399597,
"eval_logps/chosen": -542.6320190429688,
"eval_logps/rejected": -566.7861938476562,
"eval_loss": 0.7172051072120667,
"eval_rewards/accuracies": 0.5972222089767456,
"eval_rewards/chosen": -1.9679654836654663,
"eval_rewards/margins": 0.3380873501300812,
"eval_rewards/margins_max": 2.344252109527588,
"eval_rewards/margins_min": -1.388581395149231,
"eval_rewards/margins_std": 1.2205023765563965,
"eval_rewards/rejected": -2.3060529232025146,
"eval_runtime": 415.548,
"eval_samples_per_second": 9.626,
"eval_steps_per_second": 0.152,
"step": 400
},
{
"epoch": 0.3,
"grad_norm": 11.887878225278437,
"learning_rate": 4.405906807315705e-07,
"logits/chosen": -0.7631363868713379,
"logits/rejected": -0.14442148804664612,
"logps/chosen": -412.6502990722656,
"logps/rejected": -617.9203491210938,
"loss": 0.1867,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.8327445983886719,
"rewards/margins": 2.406796932220459,
"rewards/margins_max": 3.8745861053466797,
"rewards/margins_min": 0.9390074014663696,
"rewards/margins_std": 2.075767755508423,
"rewards/rejected": -4.239541530609131,
"step": 410
},
{
"epoch": 0.31,
"grad_norm": 7.6667134274072195,
"learning_rate": 4.363723290864314e-07,
"logits/chosen": -0.8663452863693237,
"logits/rejected": -0.10104439407587051,
"logps/chosen": -507.49078369140625,
"logps/rejected": -824.9513549804688,
"loss": 0.221,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.1396355628967285,
"rewards/margins": 4.1518402099609375,
"rewards/margins_max": 6.126175403594971,
"rewards/margins_min": 2.1775054931640625,
"rewards/margins_std": 2.792131185531616,
"rewards/rejected": -6.291476249694824,
"step": 420
},
{
"epoch": 0.32,
"grad_norm": 15.491732727187143,
"learning_rate": 4.3203100596610723e-07,
"logits/chosen": -0.5918745398521423,
"logits/rejected": -0.1715858429670334,
"logps/chosen": -453.0254821777344,
"logps/rejected": -597.4471435546875,
"loss": 0.1938,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.7686259746551514,
"rewards/margins": 2.167701005935669,
"rewards/margins_max": 3.2129874229431152,
"rewards/margins_min": 1.1224141120910645,
"rewards/margins_std": 1.4782588481903076,
"rewards/rejected": -3.9363269805908203,
"step": 430
},
{
"epoch": 0.32,
"grad_norm": 8.708872027507127,
"learning_rate": 4.2756957584576436e-07,
"logits/chosen": -0.584081768989563,
"logits/rejected": 0.096702441573143,
"logps/chosen": -451.47509765625,
"logps/rejected": -978.1886596679688,
"loss": 0.2022,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.9495502710342407,
"rewards/margins": 5.0804595947265625,
"rewards/margins_max": 8.66343879699707,
"rewards/margins_min": 1.4974806308746338,
"rewards/margins_std": 5.0670976638793945,
"rewards/rejected": -7.0300092697143555,
"step": 440
},
{
"epoch": 0.33,
"grad_norm": 7.538469505929578,
"learning_rate": 4.22990982449109e-07,
"logits/chosen": -0.6104982495307922,
"logits/rejected": -0.21484926342964172,
"logps/chosen": -472.439453125,
"logps/rejected": -757.8245849609375,
"loss": 0.1417,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.049053192138672,
"rewards/margins": 3.2169277667999268,
"rewards/margins_max": 4.436863422393799,
"rewards/margins_min": 1.9969921112060547,
"rewards/margins_std": 1.7252495288848877,
"rewards/rejected": -5.2659807205200195,
"step": 450
},
{
"epoch": 0.34,
"grad_norm": 10.825278124877386,
"learning_rate": 4.1829824680607104e-07,
"logits/chosen": -0.419607937335968,
"logits/rejected": 0.11389993131160736,
"logps/chosen": -435.0726623535156,
"logps/rejected": -784.734130859375,
"loss": 0.166,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.990121841430664,
"rewards/margins": 3.3705692291259766,
"rewards/margins_max": 5.105216026306152,
"rewards/margins_min": 1.6359226703643799,
"rewards/margins_std": 2.4531607627868652,
"rewards/rejected": -5.360690593719482,
"step": 460
},
{
"epoch": 0.35,
"grad_norm": 9.234625136932591,
"learning_rate": 4.134944652594794e-07,
"logits/chosen": -0.5118550062179565,
"logits/rejected": 0.10812608152627945,
"logps/chosen": -453.38848876953125,
"logps/rejected": -1031.4366455078125,
"loss": 0.1243,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.346630096435547,
"rewards/margins": 5.6441521644592285,
"rewards/margins_max": 8.810213088989258,
"rewards/margins_min": 2.4780914783477783,
"rewards/margins_std": 4.4774861335754395,
"rewards/rejected": -7.990782260894775,
"step": 470
},
{
"epoch": 0.35,
"grad_norm": 6.8345938121765775,
"learning_rate": 4.085828074220451e-07,
"logits/chosen": -0.4821593165397644,
"logits/rejected": 0.33621591329574585,
"logps/chosen": -612.152587890625,
"logps/rejected": -944.8914794921875,
"loss": 0.142,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.5179717540740967,
"rewards/margins": 4.587340354919434,
"rewards/margins_max": 7.297093868255615,
"rewards/margins_min": 1.8775880336761475,
"rewards/margins_std": 3.8321690559387207,
"rewards/rejected": -7.105312347412109,
"step": 480
},
{
"epoch": 0.36,
"grad_norm": 11.122230946658236,
"learning_rate": 4.035665140849994e-07,
"logits/chosen": -0.2719888985157013,
"logits/rejected": 0.40051668882369995,
"logps/chosen": -519.849365234375,
"logps/rejected": -941.7233276367188,
"loss": 0.1233,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.556196689605713,
"rewards/margins": 4.605846881866455,
"rewards/margins_max": 6.096743106842041,
"rewards/margins_min": 3.1149520874023438,
"rewards/margins_std": 2.1084442138671875,
"rewards/rejected": -7.162044525146484,
"step": 490
},
{
"epoch": 0.37,
"grad_norm": 4.543083572509446,
"learning_rate": 3.984488950797678e-07,
"logits/chosen": -0.19994431734085083,
"logits/rejected": 0.6510161757469177,
"logps/chosen": -450.4979553222656,
"logps/rejected": -926.5679931640625,
"loss": 0.1354,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.4891421794891357,
"rewards/margins": 4.708044052124023,
"rewards/margins_max": 7.425878047943115,
"rewards/margins_min": 1.9902098178863525,
"rewards/margins_std": 3.8435981273651123,
"rewards/rejected": -7.197185516357422,
"step": 500
},
{
"epoch": 0.37,
"eval_logits/chosen": -0.25537678599357605,
"eval_logits/rejected": -0.16171453893184662,
"eval_logps/chosen": -661.367431640625,
"eval_logps/rejected": -714.6080322265625,
"eval_loss": 0.8081530928611755,
"eval_rewards/accuracies": 0.6190476417541504,
"eval_rewards/chosen": -3.155320167541504,
"eval_rewards/margins": 0.6289510130882263,
"eval_rewards/margins_max": 4.081821918487549,
"eval_rewards/margins_min": -2.2017109394073486,
"eval_rewards/margins_std": 2.03205943107605,
"eval_rewards/rejected": -3.784270763397217,
"eval_runtime": 416.2564,
"eval_samples_per_second": 9.609,
"eval_steps_per_second": 0.151,
"step": 500
},
{
"epoch": 0.38,
"grad_norm": 11.627659490001143,
"learning_rate": 3.9323332709408904e-07,
"logits/chosen": -0.09876732528209686,
"logits/rejected": 1.3991271257400513,
"logps/chosen": -600.6998291015625,
"logps/rejected": -968.8531494140625,
"loss": 0.1308,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.3052432537078857,
"rewards/margins": 4.294064998626709,
"rewards/margins_max": 6.973275184631348,
"rewards/margins_min": 1.614854097366333,
"rewards/margins_std": 3.788975954055786,
"rewards/rejected": -7.599307060241699,
"step": 510
},
{
"epoch": 0.38,
"grad_norm": 8.873005540995397,
"learning_rate": 3.879232514440227e-07,
"logits/chosen": -0.3379233479499817,
"logits/rejected": 0.6603206992149353,
"logps/chosen": -618.7060546875,
"logps/rejected": -1049.278076171875,
"loss": 0.1475,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.1586403846740723,
"rewards/margins": 4.834142208099365,
"rewards/margins_max": 6.6787214279174805,
"rewards/margins_min": 2.989562511444092,
"rewards/margins_std": 2.6086299419403076,
"rewards/rejected": -7.992783546447754,
"step": 520
},
{
"epoch": 0.39,
"grad_norm": 17.879342011641224,
"learning_rate": 3.825221718033129e-07,
"logits/chosen": 0.0034618079662323,
"logits/rejected": 0.864820122718811,
"logps/chosen": -471.9354553222656,
"logps/rejected": -985.2346801757812,
"loss": 0.1082,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.823118209838867,
"rewards/margins": 5.269505023956299,
"rewards/margins_max": 8.90275764465332,
"rewards/margins_min": 1.6362518072128296,
"rewards/margins_std": 5.138195991516113,
"rewards/rejected": -8.092622756958008,
"step": 530
},
{
"epoch": 0.4,
"grad_norm": 6.390466873902363,
"learning_rate": 3.7703365189160746e-07,
"logits/chosen": -0.07338769733905792,
"logits/rejected": 1.4749701023101807,
"logps/chosen": -539.89697265625,
"logps/rejected": -1210.6910400390625,
"loss": 0.089,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.919481039047241,
"rewards/margins": 7.211228370666504,
"rewards/margins_max": 11.77415943145752,
"rewards/margins_min": 2.6482949256896973,
"rewards/margins_std": 6.452960968017578,
"rewards/rejected": -10.130708694458008,
"step": 540
},
{
"epoch": 0.4,
"grad_norm": 12.554873275869042,
"learning_rate": 3.714613131230587e-07,
"logits/chosen": -0.22135767340660095,
"logits/rejected": 1.1000282764434814,
"logps/chosen": -720.9986572265625,
"logps/rejected": -1223.421630859375,
"loss": 0.1223,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.7738468647003174,
"rewards/margins": 5.606228828430176,
"rewards/margins_max": 8.233736038208008,
"rewards/margins_min": 2.9787204265594482,
"rewards/margins_std": 3.715857744216919,
"rewards/rejected": -9.380073547363281,
"step": 550
},
{
"epoch": 0.41,
"grad_norm": 40.923616793220184,
"learning_rate": 3.6580883221685533e-07,
"logits/chosen": -0.0870949998497963,
"logits/rejected": 1.078148603439331,
"logps/chosen": -505.99774169921875,
"logps/rejected": -1176.008544921875,
"loss": 0.0862,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.822312831878662,
"rewards/margins": 5.737250328063965,
"rewards/margins_max": 8.857365608215332,
"rewards/margins_min": 2.6171350479125977,
"rewards/margins_std": 4.412509918212891,
"rewards/rejected": -8.559562683105469,
"step": 560
},
{
"epoch": 0.42,
"grad_norm": 2.377000403316867,
"learning_rate": 3.6007993877126386e-07,
"logits/chosen": 0.25743845105171204,
"logits/rejected": 2.0459682941436768,
"logps/chosen": -640.0938110351562,
"logps/rejected": -1272.0159912109375,
"loss": 0.1269,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.934041976928711,
"rewards/margins": 6.4811530113220215,
"rewards/margins_max": 10.410442352294922,
"rewards/margins_min": 2.5518646240234375,
"rewards/margins_std": 5.556853294372559,
"rewards/rejected": -10.415196418762207,
"step": 570
},
{
"epoch": 0.43,
"grad_norm": 6.765929979770598,
"learning_rate": 3.5427841280277937e-07,
"logits/chosen": 0.19738076627254486,
"logits/rejected": 1.5706841945648193,
"logps/chosen": -643.2400512695312,
"logps/rejected": -1103.7618408203125,
"loss": 0.1024,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.439357280731201,
"rewards/margins": 4.518318176269531,
"rewards/margins_max": 6.311240196228027,
"rewards/margins_min": 2.725395441055298,
"rewards/margins_std": 2.5355746746063232,
"rewards/rejected": -7.957674980163574,
"step": 580
},
{
"epoch": 0.43,
"grad_norm": 2.3572788749229394,
"learning_rate": 3.484080822520096e-07,
"logits/chosen": 0.4655560553073883,
"logits/rejected": 1.286608099937439,
"logps/chosen": -555.6957397460938,
"logps/rejected": -1019.0916748046875,
"loss": 0.1491,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.497023820877075,
"rewards/margins": 4.476337432861328,
"rewards/margins_max": 6.756206512451172,
"rewards/margins_min": 2.1964690685272217,
"rewards/margins_std": 3.2242209911346436,
"rewards/rejected": -7.973361968994141,
"step": 590
},
{
"epoch": 0.44,
"grad_norm": 8.25918903118385,
"learning_rate": 3.4247282045793797e-07,
"logits/chosen": 0.2085554599761963,
"logits/rejected": 1.3560742139816284,
"logps/chosen": -595.1603393554688,
"logps/rejected": -1199.1165771484375,
"loss": 0.1327,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.877821445465088,
"rewards/margins": 6.124663352966309,
"rewards/margins_max": 9.755376815795898,
"rewards/margins_min": 2.493950843811035,
"rewards/margins_std": 5.134603500366211,
"rewards/rejected": -10.002485275268555,
"step": 600
},
{
"epoch": 0.44,
"eval_logits/chosen": 0.017259376123547554,
"eval_logits/rejected": 0.1599506437778473,
"eval_logps/chosen": -731.00927734375,
"eval_logps/rejected": -798.1055908203125,
"eval_loss": 0.8436357378959656,
"eval_rewards/accuracies": 0.6190476417541504,
"eval_rewards/chosen": -3.851738452911377,
"eval_rewards/margins": 0.7675079107284546,
"eval_rewards/margins_max": 4.83132266998291,
"eval_rewards/margins_min": -2.431659460067749,
"eval_rewards/margins_std": 2.352627992630005,
"eval_rewards/rejected": -4.619246482849121,
"eval_runtime": 415.8421,
"eval_samples_per_second": 9.619,
"eval_steps_per_second": 0.151,
"step": 600
},
{
"epoch": 0.45,
"grad_norm": 13.982869383101132,
"learning_rate": 3.3647654360223144e-07,
"logits/chosen": -0.18186531960964203,
"logits/rejected": 1.947683572769165,
"logps/chosen": -636.12548828125,
"logps/rejected": -1468.92333984375,
"loss": 0.08,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.338965654373169,
"rewards/margins": 8.887590408325195,
"rewards/margins_max": 12.813148498535156,
"rewards/margins_min": 4.962031364440918,
"rewards/margins_std": 5.551577568054199,
"rewards/rejected": -12.226556777954102,
"step": 610
},
{
"epoch": 0.46,
"grad_norm": 24.68214704261548,
"learning_rate": 3.30423208125281e-07,
"logits/chosen": -0.13235849142074585,
"logits/rejected": 1.7915821075439453,
"logps/chosen": -697.5199584960938,
"logps/rejected": -1485.5936279296875,
"loss": 0.0765,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4117112159729004,
"rewards/margins": 8.921293258666992,
"rewards/margins_max": 12.249357223510742,
"rewards/margins_min": 5.593228340148926,
"rewards/margins_std": 4.7065935134887695,
"rewards/rejected": -12.333003044128418,
"step": 620
},
{
"epoch": 0.46,
"grad_norm": 10.905617995495655,
"learning_rate": 3.2431680811567833e-07,
"logits/chosen": -0.12053610384464264,
"logits/rejected": 1.8949730396270752,
"logps/chosen": -630.9464111328125,
"logps/rejected": -1220.925048828125,
"loss": 0.1229,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.5450587272644043,
"rewards/margins": 6.405971527099609,
"rewards/margins_max": 10.655710220336914,
"rewards/margins_min": 2.1562342643737793,
"rewards/margins_std": 6.010036945343018,
"rewards/rejected": -9.951030731201172,
"step": 630
},
{
"epoch": 0.47,
"grad_norm": 10.94150157360822,
"learning_rate": 3.1816137267485136e-07,
"logits/chosen": 0.027946263551712036,
"logits/rejected": 1.485925555229187,
"logps/chosen": -646.646728515625,
"logps/rejected": -1238.3758544921875,
"loss": 0.1477,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.6477348804473877,
"rewards/margins": 6.220660209655762,
"rewards/margins_max": 9.265599250793457,
"rewards/margins_min": 3.1757211685180664,
"rewards/margins_std": 4.306193828582764,
"rewards/rejected": -9.86839485168457,
"step": 640
},
{
"epoch": 0.48,
"grad_norm": 17.595942677326722,
"learning_rate": 3.1196096325859815e-07,
"logits/chosen": -0.05433236435055733,
"logits/rejected": 2.2038755416870117,
"logps/chosen": -578.5730590820312,
"logps/rejected": -1498.58203125,
"loss": 0.1156,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.005871534347534,
"rewards/margins": 9.676332473754883,
"rewards/margins_max": 15.599041938781738,
"rewards/margins_min": 3.753622531890869,
"rewards/margins_std": 8.3759765625,
"rewards/rejected": -12.68220329284668,
"step": 650
},
{
"epoch": 0.49,
"grad_norm": 7.356684331269297,
"learning_rate": 3.057196709972727e-07,
"logits/chosen": 0.11046739667654037,
"logits/rejected": 2.175269365310669,
"logps/chosen": -674.2919921875,
"logps/rejected": -1267.6500244140625,
"loss": 0.0959,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.1146275997161865,
"rewards/margins": 7.452083587646484,
"rewards/margins_max": 10.800088882446289,
"rewards/margins_min": 4.104078769683838,
"rewards/margins_std": 4.734793663024902,
"rewards/rejected": -10.56671142578125,
"step": 660
},
{
"epoch": 0.49,
"grad_norm": 7.038311259187171,
"learning_rate": 2.9944161399639086e-07,
"logits/chosen": 0.21353694796562195,
"logits/rejected": 1.7908731698989868,
"logps/chosen": -616.1519165039062,
"logps/rejected": -1157.595947265625,
"loss": 0.0791,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.5106310844421387,
"rewards/margins": 5.999436855316162,
"rewards/margins_max": 8.261363983154297,
"rewards/margins_min": 3.7375106811523438,
"rewards/margins_std": 3.198847532272339,
"rewards/rejected": -9.510068893432617,
"step": 670
},
{
"epoch": 0.5,
"grad_norm": 3.3985205014158293,
"learning_rate": 2.9313093461943824e-07,
"logits/chosen": 0.07152876257896423,
"logits/rejected": 1.9080642461776733,
"logps/chosen": -658.859619140625,
"logps/rejected": -1418.1920166015625,
"loss": 0.0643,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.6242880821228027,
"rewards/margins": 8.017306327819824,
"rewards/margins_max": 11.628385543823242,
"rewards/margins_min": 4.40622615814209,
"rewards/margins_std": 5.106837272644043,
"rewards/rejected": -11.641593933105469,
"step": 680
},
{
"epoch": 0.51,
"grad_norm": 12.970507914933444,
"learning_rate": 2.8679179675467104e-07,
"logits/chosen": 0.5070677995681763,
"logits/rejected": 2.8454136848449707,
"logps/chosen": -661.779296875,
"logps/rejected": -1588.948974609375,
"loss": 0.0704,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.405068397521973,
"rewards/margins": 9.361727714538574,
"rewards/margins_max": 15.706764221191406,
"rewards/margins_min": 3.016690731048584,
"rewards/margins_std": 8.973237037658691,
"rewards/rejected": -13.766797065734863,
"step": 690
},
{
"epoch": 0.52,
"grad_norm": 25.37176614242638,
"learning_rate": 2.80428383067716e-07,
"logits/chosen": -0.056868601590394974,
"logits/rejected": 2.1195578575134277,
"logps/chosen": -643.5035400390625,
"logps/rejected": -1405.5491943359375,
"loss": 0.0777,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.8768184185028076,
"rewards/margins": 7.676672458648682,
"rewards/margins_max": 11.38581657409668,
"rewards/margins_min": 3.967529296875,
"rewards/margins_std": 5.245521545410156,
"rewards/rejected": -11.553489685058594,
"step": 700
},
{
"epoch": 0.52,
"eval_logits/chosen": 0.4162614345550537,
"eval_logits/rejected": 0.6300503015518188,
"eval_logps/chosen": -840.1605224609375,
"eval_logps/rejected": -929.0051879882812,
"eval_loss": 0.9893194437026978,
"eval_rewards/accuracies": 0.6190476417541504,
"eval_rewards/chosen": -4.943249225616455,
"eval_rewards/margins": 0.9849926233291626,
"eval_rewards/margins_max": 6.353243827819824,
"eval_rewards/margins_min": -3.295872688293457,
"eval_rewards/margins_std": 3.1250360012054443,
"eval_rewards/rejected": -5.9282426834106445,
"eval_runtime": 421.7747,
"eval_samples_per_second": 9.484,
"eval_steps_per_second": 0.149,
"step": 700
},
{
"epoch": 0.52,
"grad_norm": 8.759540837366416,
"learning_rate": 2.7404489224177973e-07,
"logits/chosen": 0.6560094356536865,
"logits/rejected": 3.2553603649139404,
"logps/chosen": -783.5775756835938,
"logps/rejected": -1650.650146484375,
"loss": 0.1101,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.931197166442871,
"rewards/margins": 8.854988098144531,
"rewards/margins_max": 12.33712100982666,
"rewards/margins_min": 5.372857093811035,
"rewards/margins_std": 4.924478054046631,
"rewards/rejected": -13.786186218261719,
"step": 710
},
{
"epoch": 0.53,
"grad_norm": 53.838974553307395,
"learning_rate": 2.676455362072894e-07,
"logits/chosen": 0.9320627450942993,
"logits/rejected": 3.438016414642334,
"logps/chosen": -699.7535400390625,
"logps/rejected": -1655.8385009765625,
"loss": 0.0852,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.045767307281494,
"rewards/margins": 9.726736068725586,
"rewards/margins_max": 13.513631820678711,
"rewards/margins_min": 5.939839839935303,
"rewards/margins_std": 5.355479717254639,
"rewards/rejected": -14.772501945495605,
"step": 720
},
{
"epoch": 0.54,
"grad_norm": 0.8391615669250567,
"learning_rate": 2.612345373627937e-07,
"logits/chosen": 0.2621687650680542,
"logits/rejected": 1.9230273962020874,
"logps/chosen": -639.4342041015625,
"logps/rejected": -1445.03271484375,
"loss": 0.1804,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.050388336181641,
"rewards/margins": 8.062755584716797,
"rewards/margins_max": 11.662395477294922,
"rewards/margins_min": 4.4631171226501465,
"rewards/margins_std": 5.090658664703369,
"rewards/rejected": -12.113143920898438,
"step": 730
},
{
"epoch": 0.54,
"grad_norm": 18.77671464276547,
"learning_rate": 2.54816125788955e-07,
"logits/chosen": 0.5534690022468567,
"logits/rejected": 2.526615858078003,
"logps/chosen": -709.9898681640625,
"logps/rejected": -1459.970947265625,
"loss": 0.1361,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.665217399597168,
"rewards/margins": 7.5482282638549805,
"rewards/margins_max": 12.083941459655762,
"rewards/margins_min": 3.0125153064727783,
"rewards/margins_std": 6.414466857910156,
"rewards/rejected": -12.213445663452148,
"step": 740
},
{
"epoch": 0.55,
"grad_norm": 6.37822813578148,
"learning_rate": 2.4839453645747467e-07,
"logits/chosen": 0.2104567587375641,
"logits/rejected": 1.8120098114013672,
"logps/chosen": -643.4108276367188,
"logps/rejected": -1417.44921875,
"loss": 0.1312,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.7755866050720215,
"rewards/margins": 8.031917572021484,
"rewards/margins_max": 12.62381362915039,
"rewards/margins_min": 3.440018892288208,
"rewards/margins_std": 6.493924140930176,
"rewards/rejected": -11.807502746582031,
"step": 750
},
{
"epoch": 0.56,
"grad_norm": 16.8248388008373,
"learning_rate": 2.4197400643678987e-07,
"logits/chosen": 0.24539189040660858,
"logits/rejected": 1.6847679615020752,
"logps/chosen": -639.7948608398438,
"logps/rejected": -1011.7283935546875,
"loss": 0.0821,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.634861707687378,
"rewards/margins": 4.383325576782227,
"rewards/margins_max": 7.218289852142334,
"rewards/margins_min": 1.5483614206314087,
"rewards/margins_std": 4.009244918823242,
"rewards/rejected": -8.018186569213867,
"step": 760
},
{
"epoch": 0.57,
"grad_norm": 7.954736320138308,
"learning_rate": 2.3555877209638726e-07,
"logits/chosen": 0.0611066035926342,
"logits/rejected": 1.33302640914917,
"logps/chosen": -672.7412719726562,
"logps/rejected": -1782.3125,
"loss": 0.0906,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4519970417022705,
"rewards/margins": 11.562549591064453,
"rewards/margins_max": 20.07329559326172,
"rewards/margins_min": 3.0518016815185547,
"rewards/margins_std": 12.036015510559082,
"rewards/rejected": -15.014546394348145,
"step": 770
},
{
"epoch": 0.57,
"grad_norm": 11.05108228058454,
"learning_rate": 2.2915306631157817e-07,
"logits/chosen": 0.2885664105415344,
"logits/rejected": 2.206385612487793,
"logps/chosen": -648.3999633789062,
"logps/rejected": -1299.401123046875,
"loss": 0.1085,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.7839324474334717,
"rewards/margins": 7.008673191070557,
"rewards/margins_max": 9.869766235351562,
"rewards/margins_min": 4.147579669952393,
"rewards/margins_std": 4.046196937561035,
"rewards/rejected": -10.792604446411133,
"step": 780
},
{
"epoch": 0.58,
"grad_norm": 32.49887802957626,
"learning_rate": 2.2276111567057887e-07,
"logits/chosen": 0.22940261662006378,
"logits/rejected": 1.6958719491958618,
"logps/chosen": -593.3724365234375,
"logps/rejected": -1174.9674072265625,
"loss": 0.1111,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.6294262409210205,
"rewards/margins": 5.80316162109375,
"rewards/margins_max": 8.796818733215332,
"rewards/margins_min": 2.8095040321350098,
"rewards/margins_std": 4.233671188354492,
"rewards/rejected": -9.432588577270508,
"step": 790
},
{
"epoch": 0.59,
"grad_norm": 7.06163362995566,
"learning_rate": 2.1638713768573936e-07,
"logits/chosen": 0.06335971504449844,
"logits/rejected": 1.4285287857055664,
"logps/chosen": -595.5140380859375,
"logps/rejected": -1295.677490234375,
"loss": 0.0638,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.581036329269409,
"rewards/margins": 7.286231994628906,
"rewards/margins_max": 11.22960090637207,
"rewards/margins_min": 3.342862606048584,
"rewards/margins_std": 5.576765537261963,
"rewards/rejected": -10.867268562316895,
"step": 800
},
{
"epoch": 0.59,
"eval_logits/chosen": 0.12438549101352692,
"eval_logits/rejected": 0.28890377283096313,
"eval_logps/chosen": -732.38525390625,
"eval_logps/rejected": -799.7516479492188,
"eval_loss": 0.8086485862731934,
"eval_rewards/accuracies": 0.6190476417541504,
"eval_rewards/chosen": -3.8654978275299072,
"eval_rewards/margins": 0.7702099680900574,
"eval_rewards/margins_max": 4.502103328704834,
"eval_rewards/margins_min": -2.291940450668335,
"eval_rewards/margins_std": 2.2426791191101074,
"eval_rewards/rejected": -4.635707378387451,
"eval_runtime": 417.0386,
"eval_samples_per_second": 9.591,
"eval_steps_per_second": 0.151,
"step": 800
},
{
"epoch": 0.6,
"grad_norm": 5.3968517354258125,
"learning_rate": 2.100353380107609e-07,
"logits/chosen": 0.23273587226867676,
"logits/rejected": 1.9462811946868896,
"logps/chosen": -776.3011474609375,
"logps/rejected": -1441.837158203125,
"loss": 0.1,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.670359134674072,
"rewards/margins": 7.1968560218811035,
"rewards/margins_max": 11.221755981445312,
"rewards/margins_min": 3.171954393386841,
"rewards/margins_std": 5.6920695304870605,
"rewards/rejected": -11.86721420288086,
"step": 810
},
{
"epoch": 0.6,
"grad_norm": 15.03932252297939,
"learning_rate": 2.0370990766573698e-07,
"logits/chosen": -0.10733046382665634,
"logits/rejected": 1.8043702840805054,
"logps/chosen": -650.6616821289062,
"logps/rejected": -1616.010986328125,
"loss": 0.0713,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.1738812923431396,
"rewards/margins": 10.541234970092773,
"rewards/margins_max": 15.04127311706543,
"rewards/margins_min": 6.041195392608643,
"rewards/margins_std": 6.364017009735107,
"rewards/rejected": -13.715115547180176,
"step": 820
},
{
"epoch": 0.61,
"grad_norm": 31.097886733723477,
"learning_rate": 1.974150202718513e-07,
"logits/chosen": 0.08039845526218414,
"logits/rejected": 2.343336582183838,
"logps/chosen": -534.8485717773438,
"logps/rejected": -1418.825439453125,
"loss": 0.0511,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.1201188564300537,
"rewards/margins": 8.804471969604492,
"rewards/margins_max": 12.42898178100586,
"rewards/margins_min": 5.179962635040283,
"rewards/margins_std": 5.12583065032959,
"rewards/rejected": -11.924591064453125,
"step": 830
},
{
"epoch": 0.62,
"grad_norm": 16.60986174297272,
"learning_rate": 1.9115482929755445e-07,
"logits/chosen": 0.24223566055297852,
"logits/rejected": 1.6932157278060913,
"logps/chosen": -570.802978515625,
"logps/rejected": -1331.71533203125,
"loss": 0.0856,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.493419647216797,
"rewards/margins": 7.847373962402344,
"rewards/margins_max": 11.789865493774414,
"rewards/margins_min": 3.9048819541931152,
"rewards/margins_std": 5.575525760650635,
"rewards/rejected": -11.34079360961914,
"step": 840
},
{
"epoch": 0.63,
"grad_norm": 13.502668975247548,
"learning_rate": 1.8493346531803887e-07,
"logits/chosen": 0.48027992248535156,
"logits/rejected": 2.202148675918579,
"logps/chosen": -596.4915161132812,
"logps/rejected": -1282.6644287109375,
"loss": 0.0983,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.9702115058898926,
"rewards/margins": 6.948336124420166,
"rewards/margins_max": 9.718558311462402,
"rewards/margins_min": 4.178112506866455,
"rewards/margins_std": 3.9176864624023438,
"rewards/rejected": -10.918546676635742,
"step": 850
},
{
"epoch": 0.63,
"grad_norm": 23.139494234389517,
"learning_rate": 1.7875503328981807e-07,
"logits/chosen": 0.3601033091545105,
"logits/rejected": 2.474608898162842,
"logps/chosen": -652.9142456054688,
"logps/rejected": -1604.696533203125,
"loss": 0.0605,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.9101157188415527,
"rewards/margins": 9.86131477355957,
"rewards/margins_max": 14.181074142456055,
"rewards/margins_min": 5.541555881500244,
"rewards/margins_std": 6.1090617179870605,
"rewards/rejected": -13.771429061889648,
"step": 860
},
{
"epoch": 0.64,
"grad_norm": 14.349796836286524,
"learning_rate": 1.7262360984221006e-07,
"logits/chosen": 0.012769157998263836,
"logits/rejected": 1.9421314001083374,
"logps/chosen": -664.3881225585938,
"logps/rejected": -1434.8948974609375,
"loss": 0.1274,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.80120849609375,
"rewards/margins": 8.072778701782227,
"rewards/margins_max": 11.85603141784668,
"rewards/margins_min": 4.289526462554932,
"rewards/margins_std": 5.350326061248779,
"rewards/rejected": -11.873987197875977,
"step": 870
},
{
"epoch": 0.65,
"grad_norm": 15.485607186999017,
"learning_rate": 1.6654324058751175e-07,
"logits/chosen": 0.3775918483734131,
"logits/rejected": 1.973515510559082,
"logps/chosen": -713.2658081054688,
"logps/rejected": -1631.826904296875,
"loss": 0.0662,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.8164262771606445,
"rewards/margins": 9.047931671142578,
"rewards/margins_max": 12.981298446655273,
"rewards/margins_min": 5.114563941955566,
"rewards/margins_std": 5.562621116638184,
"rewards/rejected": -13.864356994628906,
"step": 880
},
{
"epoch": 0.65,
"grad_norm": 15.607832046954224,
"learning_rate": 1.6051793745163812e-07,
"logits/chosen": 0.6472679376602173,
"logits/rejected": 2.5574803352355957,
"logps/chosen": -689.5281982421875,
"logps/rejected": -1642.12109375,
"loss": 0.1011,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.642444610595703,
"rewards/margins": 9.717334747314453,
"rewards/margins_max": 15.204099655151367,
"rewards/margins_min": 4.230566501617432,
"rewards/margins_std": 7.75946044921875,
"rewards/rejected": -14.359777450561523,
"step": 890
},
{
"epoch": 0.66,
"grad_norm": 45.565136882193066,
"learning_rate": 1.5455167602698915e-07,
"logits/chosen": 0.06020700931549072,
"logits/rejected": 2.2921700477600098,
"logps/chosen": -727.0872192382812,
"logps/rejected": -1482.33837890625,
"loss": 0.0997,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.481110572814941,
"rewards/margins": 7.914282321929932,
"rewards/margins_max": 10.535211563110352,
"rewards/margins_min": 5.293350696563721,
"rewards/margins_std": 3.7065558433532715,
"rewards/rejected": -12.395392417907715,
"step": 900
},
{
"epoch": 0.66,
"eval_logits/chosen": 0.20550121366977692,
"eval_logits/rejected": 0.3917555809020996,
"eval_logps/chosen": -789.8953857421875,
"eval_logps/rejected": -866.7603149414062,
"eval_loss": 0.8639366030693054,
"eval_rewards/accuracies": 0.6269841194152832,
"eval_rewards/chosen": -4.4405999183654785,
"eval_rewards/margins": 0.8651944398880005,
"eval_rewards/margins_max": 5.159237861633301,
"eval_rewards/margins_min": -2.6377525329589844,
"eval_rewards/margins_std": 2.5658202171325684,
"eval_rewards/rejected": -5.305793762207031,
"eval_runtime": 419.7425,
"eval_samples_per_second": 9.53,
"eval_steps_per_second": 0.15,
"step": 900
},
{
"epoch": 0.67,
"grad_norm": 7.254262825774854,
"learning_rate": 1.4864839294928924e-07,
"logits/chosen": 0.2960719168186188,
"logits/rejected": 2.519636392593384,
"logps/chosen": -667.0858154296875,
"logps/rejected": -2054.51953125,
"loss": 0.1092,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -4.099366664886475,
"rewards/margins": 13.694231033325195,
"rewards/margins_max": 20.22653579711914,
"rewards/margins_min": 7.161923408508301,
"rewards/margins_std": 9.238077163696289,
"rewards/rejected": -17.793596267700195,
"step": 910
},
{
"epoch": 0.68,
"grad_norm": 19.00706315113973,
"learning_rate": 1.428119833001315e-07,
"logits/chosen": 0.011763498187065125,
"logits/rejected": 2.5436980724334717,
"logps/chosen": -683.8145751953125,
"logps/rejected": -1476.839111328125,
"loss": 0.0479,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.7376608848571777,
"rewards/margins": 9.0157470703125,
"rewards/margins_max": 13.42829418182373,
"rewards/margins_min": 4.603199481964111,
"rewards/margins_std": 6.2402849197387695,
"rewards/rejected": -12.75340747833252,
"step": 920
},
{
"epoch": 0.68,
"grad_norm": 2.449628285920275,
"learning_rate": 1.370462980369401e-07,
"logits/chosen": 0.11705155670642853,
"logits/rejected": 1.5357266664505005,
"logps/chosen": -766.4974365234375,
"logps/rejected": -1289.979248046875,
"loss": 0.0636,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.693875312805176,
"rewards/margins": 5.769114017486572,
"rewards/margins_max": 7.741427421569824,
"rewards/margins_min": 3.796800136566162,
"rewards/margins_std": 2.7892730236053467,
"rewards/rejected": -10.462987899780273,
"step": 930
},
{
"epoch": 0.69,
"grad_norm": 11.213992357762015,
"learning_rate": 1.3135514145204606e-07,
"logits/chosen": 0.21615874767303467,
"logits/rejected": 2.0779476165771484,
"logps/chosen": -605.4188232421875,
"logps/rejected": -1608.945556640625,
"loss": 0.0971,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.8029799461364746,
"rewards/margins": 10.208128929138184,
"rewards/margins_max": 17.389694213867188,
"rewards/margins_min": 3.0265650749206543,
"rewards/margins_std": 10.15626335144043,
"rewards/rejected": -14.011110305786133,
"step": 940
},
{
"epoch": 0.7,
"grad_norm": 4.371361045173521,
"learning_rate": 1.257422686625539e-07,
"logits/chosen": 0.16180220246315002,
"logits/rejected": 2.055144786834717,
"logps/chosen": -682.2508544921875,
"logps/rejected": -1589.9964599609375,
"loss": 0.0906,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.9322009086608887,
"rewards/margins": 9.378369331359863,
"rewards/margins_max": 14.781808853149414,
"rewards/margins_min": 3.9749279022216797,
"rewards/margins_std": 7.641619682312012,
"rewards/rejected": -13.310567855834961,
"step": 950
},
{
"epoch": 0.71,
"grad_norm": 3.4893980542106102,
"learning_rate": 1.2021138313265444e-07,
"logits/chosen": 0.11532745510339737,
"logits/rejected": 1.866121530532837,
"logps/chosen": -634.554931640625,
"logps/rejected": -1674.252197265625,
"loss": 0.1202,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.853950023651123,
"rewards/margins": 10.867055892944336,
"rewards/margins_max": 18.760677337646484,
"rewards/margins_min": 2.97343373298645,
"rewards/margins_std": 11.163267135620117,
"rewards/rejected": -14.7210054397583,
"step": 960
},
{
"epoch": 0.71,
"grad_norm": 14.447489915734623,
"learning_rate": 1.1476613423001974e-07,
"logits/chosen": 0.17886893451213837,
"logits/rejected": 1.89533269405365,
"logps/chosen": -677.0606079101562,
"logps/rejected": -1261.5345458984375,
"loss": 0.0658,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.194746971130371,
"rewards/margins": 5.769103050231934,
"rewards/margins_max": 7.97817325592041,
"rewards/margins_min": 3.560032606124878,
"rewards/margins_std": 3.1240971088409424,
"rewards/rejected": -9.963850021362305,
"step": 970
},
{
"epoch": 0.72,
"grad_norm": 3.9798658979228856,
"learning_rate": 1.0941011481789042e-07,
"logits/chosen": 0.034214410930871964,
"logits/rejected": 2.867272138595581,
"logps/chosen": -702.2564697265625,
"logps/rejected": -1857.8795166015625,
"loss": 0.0935,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.056910037994385,
"rewards/margins": 11.857443809509277,
"rewards/margins_max": 18.30853843688965,
"rewards/margins_min": 5.406346797943115,
"rewards/margins_std": 9.123228073120117,
"rewards/rejected": -15.91435432434082,
"step": 980
},
{
"epoch": 0.73,
"grad_norm": 27.849338662173917,
"learning_rate": 1.041468588844476e-07,
"logits/chosen": 0.4994427263736725,
"logits/rejected": 2.539013385772705,
"logps/chosen": -599.5453491210938,
"logps/rejected": -1590.7774658203125,
"loss": 0.0699,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.911860704421997,
"rewards/margins": 10.174264907836914,
"rewards/margins_max": 15.016085624694824,
"rewards/margins_min": 5.332446098327637,
"rewards/margins_std": 6.8473663330078125,
"rewards/rejected": -14.086126327514648,
"step": 990
},
{
"epoch": 0.74,
"grad_norm": 4.2150247037639375,
"learning_rate": 9.897983921102954e-08,
"logits/chosen": -0.2390742003917694,
"logits/rejected": 2.2101035118103027,
"logps/chosen": -670.8737182617188,
"logps/rejected": -1509.370361328125,
"loss": 0.0708,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.90120005607605,
"rewards/margins": 8.870689392089844,
"rewards/margins_max": 11.843083381652832,
"rewards/margins_min": 5.89829683303833,
"rewards/margins_std": 4.203598976135254,
"rewards/rejected": -12.771888732910156,
"step": 1000
},
{
"epoch": 0.74,
"eval_logits/chosen": 0.21985697746276855,
"eval_logits/rejected": 0.4062546491622925,
"eval_logps/chosen": -791.2946166992188,
"eval_logps/rejected": -865.1302490234375,
"eval_loss": 0.8618067502975464,
"eval_rewards/accuracies": 0.6230158805847168,
"eval_rewards/chosen": -4.454591751098633,
"eval_rewards/margins": 0.8349014520645142,
"eval_rewards/margins_max": 5.060412406921387,
"eval_rewards/margins_min": -2.622389078140259,
"eval_rewards/margins_std": 2.52128529548645,
"eval_rewards/rejected": -5.289493083953857,
"eval_runtime": 419.5466,
"eval_samples_per_second": 9.534,
"eval_steps_per_second": 0.15,
"step": 1000
},
{
"epoch": 0.74,
"grad_norm": 12.140211164365056,
"learning_rate": 9.391246508073433e-08,
"logits/chosen": 0.13034725189208984,
"logits/rejected": 2.0794267654418945,
"logps/chosen": -724.4019775390625,
"logps/rejected": -1571.5511474609375,
"loss": 0.0777,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.304561614990234,
"rewards/margins": 8.91108512878418,
"rewards/margins_max": 12.622480392456055,
"rewards/margins_min": 5.199688911437988,
"rewards/margins_std": 5.248705863952637,
"rewards/rejected": -13.215646743774414,
"step": 1010
},
{
"epoch": 0.75,
"grad_norm": 72.48315962813399,
"learning_rate": 8.894808002892037e-08,
"logits/chosen": 0.19714145362377167,
"logits/rejected": 2.8781895637512207,
"logps/chosen": -689.0614624023438,
"logps/rejected": -1635.4539794921875,
"loss": 0.0641,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -3.8019371032714844,
"rewards/margins": 10.48505687713623,
"rewards/margins_max": 15.840913772583008,
"rewards/margins_min": 5.129199981689453,
"rewards/margins_std": 7.5743255615234375,
"rewards/rejected": -14.286993026733398,
"step": 1020
},
{
"epoch": 0.76,
"grad_norm": 20.88616124929115,
"learning_rate": 8.408995963708756e-08,
"logits/chosen": -0.0833059698343277,
"logits/rejected": 2.3186755180358887,
"logps/chosen": -681.8640747070312,
"logps/rejected": -1602.0863037109375,
"loss": 0.0246,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.856149673461914,
"rewards/margins": 9.128388404846191,
"rewards/margins_max": 12.01569652557373,
"rewards/margins_min": 6.241078853607178,
"rewards/margins_std": 4.0832719802856445,
"rewards/rejected": -12.984537124633789,
"step": 1030
},
{
"epoch": 0.77,
"grad_norm": 9.834472583209813,
"learning_rate": 7.934130937159508e-08,
"logits/chosen": 0.17558620870113373,
"logits/rejected": 2.297236442565918,
"logps/chosen": -637.3060302734375,
"logps/rejected": -1326.9390869140625,
"loss": 0.0496,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.932767152786255,
"rewards/margins": 7.218419075012207,
"rewards/margins_max": 9.979570388793945,
"rewards/margins_min": 4.457267761230469,
"rewards/margins_std": 3.904857635498047,
"rewards/rejected": -11.151185989379883,
"step": 1040
},
{
"epoch": 0.77,
"grad_norm": 5.026095263611361,
"learning_rate": 7.470526246864364e-08,
"logits/chosen": 0.39160841703414917,
"logits/rejected": 2.559542179107666,
"logps/chosen": -693.7269287109375,
"logps/rejected": -1849.744873046875,
"loss": 0.0552,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.237768650054932,
"rewards/margins": 12.45046329498291,
"rewards/margins_max": 19.93360710144043,
"rewards/margins_min": 4.967319488525391,
"rewards/margins_std": 10.582763671875,
"rewards/rejected": -16.688232421875,
"step": 1050
},
{
"epoch": 0.78,
"grad_norm": 0.6591285293800628,
"learning_rate": 7.018487786691512e-08,
"logits/chosen": 0.43399763107299805,
"logits/rejected": 2.060253381729126,
"logps/chosen": -745.4591674804688,
"logps/rejected": -1831.240478515625,
"loss": 0.0678,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.024683475494385,
"rewards/margins": 10.875140190124512,
"rewards/margins_max": 17.002622604370117,
"rewards/margins_min": 4.747661113739014,
"rewards/margins_std": 8.66556453704834,
"rewards/rejected": -15.899823188781738,
"step": 1060
},
{
"epoch": 0.79,
"grad_norm": 4.119017563303306,
"learning_rate": 6.578313818923559e-08,
"logits/chosen": -0.07052882760763168,
"logits/rejected": 1.8699405193328857,
"logps/chosen": -909.0846557617188,
"logps/rejected": -1548.6923828125,
"loss": 0.0634,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.350946426391602,
"rewards/margins": 7.820859432220459,
"rewards/margins_max": 11.624895095825195,
"rewards/margins_min": 4.016822338104248,
"rewards/margins_std": 5.379720211029053,
"rewards/rejected": -13.171804428100586,
"step": 1070
},
{
"epoch": 0.79,
"grad_norm": 16.860241482971446,
"learning_rate": 6.15029477745925e-08,
"logits/chosen": 0.48959070444107056,
"logits/rejected": 2.1462438106536865,
"logps/chosen": -734.9025268554688,
"logps/rejected": -1803.1939697265625,
"loss": 0.0725,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.121659755706787,
"rewards/margins": 10.34645938873291,
"rewards/margins_max": 14.924234390258789,
"rewards/margins_min": 5.768682479858398,
"rewards/margins_std": 6.473954200744629,
"rewards/rejected": -15.468118667602539,
"step": 1080
},
{
"epoch": 0.8,
"grad_norm": 18.379765722708388,
"learning_rate": 5.734713076180486e-08,
"logits/chosen": 0.46901997923851013,
"logits/rejected": 3.454606294631958,
"logps/chosen": -741.1581420898438,
"logps/rejected": -1905.183349609375,
"loss": 0.0713,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.039034843444824,
"rewards/margins": 12.11182975769043,
"rewards/margins_max": 19.285795211791992,
"rewards/margins_min": 4.937865257263184,
"rewards/margins_std": 10.145517349243164,
"rewards/rejected": -17.15086555480957,
"step": 1090
},
{
"epoch": 0.81,
"grad_norm": 4.317359176747138,
"learning_rate": 5.3318429226110875e-08,
"logits/chosen": 0.19755136966705322,
"logits/rejected": 2.050144672393799,
"logps/chosen": -604.0868530273438,
"logps/rejected": -1733.5550537109375,
"loss": 0.141,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.046292304992676,
"rewards/margins": 11.252501487731934,
"rewards/margins_max": 16.934438705444336,
"rewards/margins_min": 5.57056188583374,
"rewards/margins_std": 8.03547477722168,
"rewards/rejected": -15.298794746398926,
"step": 1100
},
{
"epoch": 0.81,
"eval_logits/chosen": 0.3016913831233978,
"eval_logits/rejected": 0.5082818865776062,
"eval_logps/chosen": -832.3104858398438,
"eval_logps/rejected": -915.954833984375,
"eval_loss": 0.9049465656280518,
"eval_rewards/accuracies": 0.6190476417541504,
"eval_rewards/chosen": -4.864750385284424,
"eval_rewards/margins": 0.9329892992973328,
"eval_rewards/margins_max": 5.632690906524658,
"eval_rewards/margins_min": -2.8439128398895264,
"eval_rewards/margins_std": 2.7856106758117676,
"eval_rewards/rejected": -5.7977399826049805,
"eval_runtime": 414.0109,
"eval_samples_per_second": 9.662,
"eval_steps_per_second": 0.152,
"step": 1100
},
{
"epoch": 0.82,
"grad_norm": 13.4435984156611,
"learning_rate": 4.9419501369902026e-08,
"logits/chosen": 0.08746049553155899,
"logits/rejected": 2.6451172828674316,
"logps/chosen": -771.4244384765625,
"logps/rejected": -2024.484619140625,
"loss": 0.1408,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.441189289093018,
"rewards/margins": 13.507779121398926,
"rewards/margins_max": 19.457698822021484,
"rewards/margins_min": 7.557857513427734,
"rewards/margins_std": 8.414458274841309,
"rewards/rejected": -17.9489688873291,
"step": 1110
},
{
"epoch": 0.82,
"grad_norm": 1.4128692999239585,
"learning_rate": 4.5652919768798896e-08,
"logits/chosen": 0.4677937924861908,
"logits/rejected": 2.3705403804779053,
"logps/chosen": -793.5311279296875,
"logps/rejected": -1775.6380615234375,
"loss": 0.0608,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.224045276641846,
"rewards/margins": 10.134596824645996,
"rewards/margins_max": 15.679702758789062,
"rewards/margins_min": 4.589491844177246,
"rewards/margins_std": 7.8419623374938965,
"rewards/rejected": -15.358640670776367,
"step": 1120
},
{
"epoch": 0.83,
"grad_norm": 16.039453526164788,
"learning_rate": 4.2021169674223536e-08,
"logits/chosen": 0.2930324077606201,
"logits/rejected": 2.399545431137085,
"logps/chosen": -655.0755615234375,
"logps/rejected": -1648.030029296875,
"loss": 0.0511,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.887598752975464,
"rewards/margins": 10.443506240844727,
"rewards/margins_max": 14.695414543151855,
"rewards/margins_min": 6.191596984863281,
"rewards/margins_std": 6.013107776641846,
"rewards/rejected": -14.331106185913086,
"step": 1130
},
{
"epoch": 0.84,
"grad_norm": 26.479285274862587,
"learning_rate": 3.852664737359046e-08,
"logits/chosen": 0.3496669828891754,
"logits/rejected": 1.97479248046875,
"logps/chosen": -852.40380859375,
"logps/rejected": -1573.5230712890625,
"loss": 0.0768,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -5.772242546081543,
"rewards/margins": 7.9194207191467285,
"rewards/margins_max": 12.818387985229492,
"rewards/margins_min": 3.0204524993896484,
"rewards/margins_std": 6.928186893463135,
"rewards/rejected": -13.691662788391113,
"step": 1140
},
{
"epoch": 0.85,
"grad_norm": 0.7263792166932626,
"learning_rate": 3.5171658609197824e-08,
"logits/chosen": 0.1613047868013382,
"logits/rejected": 2.029664993286133,
"logps/chosen": -742.6275024414062,
"logps/rejected": -1609.7635498046875,
"loss": 0.1096,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.6217732429504395,
"rewards/margins": 8.67860221862793,
"rewards/margins_max": 13.895421981811523,
"rewards/margins_min": 3.4617819786071777,
"rewards/margins_std": 7.377697944641113,
"rewards/rejected": -13.300374984741211,
"step": 1150
},
{
"epoch": 0.85,
"grad_norm": 3.13150099340305,
"learning_rate": 3.195841705686139e-08,
"logits/chosen": 0.460742712020874,
"logits/rejected": 2.694736957550049,
"logps/chosen": -821.4349365234375,
"logps/rejected": -1898.295654296875,
"loss": 0.0821,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.314339637756348,
"rewards/margins": 11.54991626739502,
"rewards/margins_max": 18.18251609802246,
"rewards/margins_min": 4.917316436767578,
"rewards/margins_std": 9.379911422729492,
"rewards/rejected": -16.864253997802734,
"step": 1160
},
{
"epoch": 0.86,
"grad_norm": 16.312675595535207,
"learning_rate": 2.8889042865294837e-08,
"logits/chosen": 0.13087859749794006,
"logits/rejected": 2.484839916229248,
"logps/chosen": -702.7008056640625,
"logps/rejected": -1441.55078125,
"loss": 0.0469,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.257961750030518,
"rewards/margins": 7.842066287994385,
"rewards/margins_max": 10.642562866210938,
"rewards/margins_min": 5.041568756103516,
"rewards/margins_std": 3.960501194000244,
"rewards/rejected": -12.100028991699219,
"step": 1170
},
{
"epoch": 0.87,
"grad_norm": 9.055687386628646,
"learning_rate": 2.5965561257202036e-08,
"logits/chosen": 0.1169591173529625,
"logits/rejected": 2.362281560897827,
"logps/chosen": -763.2276611328125,
"logps/rejected": -1660.2099609375,
"loss": 0.0572,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.720892906188965,
"rewards/margins": 9.803942680358887,
"rewards/margins_max": 15.876733779907227,
"rewards/margins_min": 3.731149196624756,
"rewards/margins_std": 8.588226318359375,
"rewards/rejected": -14.524835586547852,
"step": 1180
},
{
"epoch": 0.88,
"grad_norm": 22.841895074273324,
"learning_rate": 2.318990119300218e-08,
"logits/chosen": 0.10627205669879913,
"logits/rejected": 1.2642805576324463,
"logps/chosen": -798.917724609375,
"logps/rejected": -2165.9775390625,
"loss": 0.0395,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.4411396980285645,
"rewards/margins": 13.126449584960938,
"rewards/margins_max": 20.357501983642578,
"rewards/margins_min": 5.8953962326049805,
"rewards/margins_std": 10.226253509521484,
"rewards/rejected": -18.567590713500977,
"step": 1190
},
{
"epoch": 0.88,
"grad_norm": 17.973583296727792,
"learning_rate": 2.0563894098070216e-08,
"logits/chosen": 0.15934190154075623,
"logits/rejected": 2.1497673988342285,
"logps/chosen": -712.0560302734375,
"logps/rejected": -1505.4547119140625,
"loss": 0.0775,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.412214756011963,
"rewards/margins": 8.303590774536133,
"rewards/margins_max": 12.088435173034668,
"rewards/margins_min": 4.5187482833862305,
"rewards/margins_std": 5.352576732635498,
"rewards/rejected": -12.715806007385254,
"step": 1200
},
{
"epoch": 0.88,
"eval_logits/chosen": 0.30742567777633667,
"eval_logits/rejected": 0.5172090530395508,
"eval_logps/chosen": -836.2312622070312,
"eval_logps/rejected": -922.0319213867188,
"eval_loss": 0.9049317836761475,
"eval_rewards/accuracies": 0.6210317611694336,
"eval_rewards/chosen": -4.903958320617676,
"eval_rewards/margins": 0.9545530080795288,
"eval_rewards/margins_max": 5.713037014007568,
"eval_rewards/margins_min": -2.831618309020996,
"eval_rewards/margins_std": 2.813220262527466,
"eval_rewards/rejected": -5.858510971069336,
"eval_runtime": 422.5993,
"eval_samples_per_second": 9.465,
"eval_steps_per_second": 0.149,
"step": 1200
},
{
"epoch": 0.89,
"grad_norm": 2.7225416780438763,
"learning_rate": 1.8089272654333353e-08,
"logits/chosen": 0.28706851601600647,
"logits/rejected": 1.9062206745147705,
"logps/chosen": -866.8541259765625,
"logps/rejected": -1701.005615234375,
"loss": 0.0693,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.099188804626465,
"rewards/margins": 9.302727699279785,
"rewards/margins_max": 13.995088577270508,
"rewards/margins_min": 4.61036491394043,
"rewards/margins_std": 6.6360015869140625,
"rewards/rejected": -14.40191650390625,
"step": 1210
},
{
"epoch": 0.9,
"grad_norm": 7.492427847668467,
"learning_rate": 1.5767669657019005e-08,
"logits/chosen": 0.21484322845935822,
"logits/rejected": 2.9490137100219727,
"logps/chosen": -665.4578857421875,
"logps/rejected": -1718.431640625,
"loss": 0.0694,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.03716516494751,
"rewards/margins": 11.106006622314453,
"rewards/margins_max": 14.720375061035156,
"rewards/margins_min": 7.491639137268066,
"rewards/margins_std": 5.11148738861084,
"rewards/rejected": -15.143171310424805,
"step": 1220
},
{
"epoch": 0.91,
"grad_norm": 14.252457056430137,
"learning_rate": 1.3600616937310267e-08,
"logits/chosen": 0.3399500250816345,
"logits/rejected": 2.5051798820495605,
"logps/chosen": -776.6029663085938,
"logps/rejected": -1890.706298828125,
"loss": 0.0533,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.925044059753418,
"rewards/margins": 11.241573333740234,
"rewards/margins_max": 16.437541961669922,
"rewards/margins_min": 6.045604228973389,
"rewards/margins_std": 7.348209381103516,
"rewards/rejected": -16.166616439819336,
"step": 1230
},
{
"epoch": 0.91,
"grad_norm": 3.254929425883996,
"learning_rate": 1.1589544351619047e-08,
"logits/chosen": 0.8039329648017883,
"logits/rejected": 3.354154109954834,
"logps/chosen": -724.2069091796875,
"logps/rejected": -2016.739501953125,
"loss": 0.0701,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.160962104797363,
"rewards/margins": 12.87867259979248,
"rewards/margins_max": 20.457225799560547,
"rewards/margins_min": 5.300119400024414,
"rewards/margins_std": 10.717691421508789,
"rewards/rejected": -18.03963279724121,
"step": 1240
},
{
"epoch": 0.92,
"grad_norm": 9.743725835120221,
"learning_rate": 9.735778838143749e-09,
"logits/chosen": 0.17006321251392365,
"logits/rejected": 3.252281904220581,
"logps/chosen": -771.3798828125,
"logps/rejected": -2618.41943359375,
"loss": 0.0852,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.744952201843262,
"rewards/margins": 18.770408630371094,
"rewards/margins_max": 27.798681259155273,
"rewards/margins_min": 9.742134094238281,
"rewards/margins_std": 12.767908096313477,
"rewards/rejected": -23.51535987854004,
"step": 1250
},
{
"epoch": 0.93,
"grad_norm": 9.000437498002796,
"learning_rate": 8.040543541333655e-09,
"logits/chosen": 0.2970607578754425,
"logits/rejected": 3.4422898292541504,
"logps/chosen": -716.0152587890625,
"logps/rejected": -1930.673095703125,
"loss": 0.0554,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.7243733406066895,
"rewards/margins": 12.035941123962402,
"rewards/margins_max": 17.656423568725586,
"rewards/margins_min": 6.415456295013428,
"rewards/margins_std": 7.9485650062561035,
"rewards/rejected": -16.760313034057617,
"step": 1260
},
{
"epoch": 0.93,
"grad_norm": 16.466144409333417,
"learning_rate": 6.504957004838746e-09,
"logits/chosen": -0.05619863420724869,
"logits/rejected": 1.9224863052368164,
"logps/chosen": -841.8850708007812,
"logps/rejected": -1936.7113037109375,
"loss": 0.0456,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.742644309997559,
"rewards/margins": 11.516191482543945,
"rewards/margins_max": 15.864044189453125,
"rewards/margins_min": 7.168337821960449,
"rewards/margins_std": 6.148792266845703,
"rewards/rejected": -16.258834838867188,
"step": 1270
},
{
"epoch": 0.94,
"grad_norm": 2.654070322101592,
"learning_rate": 5.130032433476483e-09,
"logits/chosen": 0.3038169741630554,
"logits/rejected": 2.8313422203063965,
"logps/chosen": -728.2089233398438,
"logps/rejected": -1743.801513671875,
"loss": 0.069,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.6006269454956055,
"rewards/margins": 11.062047004699707,
"rewards/margins_max": 16.25905418395996,
"rewards/margins_min": 5.8650407791137695,
"rewards/margins_std": 7.349676609039307,
"rewards/rejected": -15.662673950195312,
"step": 1280
},
{
"epoch": 0.95,
"grad_norm": 4.94775999947406,
"learning_rate": 3.916677024702858e-09,
"logits/chosen": 0.1287023425102234,
"logits/rejected": 2.0298779010772705,
"logps/chosen": -667.8201904296875,
"logps/rejected": -1396.031005859375,
"loss": 0.0567,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.1791486740112305,
"rewards/margins": 7.460590362548828,
"rewards/margins_max": 10.305280685424805,
"rewards/margins_min": 4.615899562835693,
"rewards/margins_std": 4.023000240325928,
"rewards/rejected": -11.639739990234375,
"step": 1290
},
{
"epoch": 0.96,
"grad_norm": 62.41949761633163,
"learning_rate": 2.865691370028761e-09,
"logits/chosen": 0.3163800835609436,
"logits/rejected": 2.587982416152954,
"logps/chosen": -711.7886962890625,
"logps/rejected": -1568.990234375,
"loss": 0.0464,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.575112342834473,
"rewards/margins": 9.028793334960938,
"rewards/margins_max": 13.794398307800293,
"rewards/margins_min": 4.263186454772949,
"rewards/margins_std": 6.739584922790527,
"rewards/rejected": -13.603904724121094,
"step": 1300
},
{
"epoch": 0.96,
"eval_logits/chosen": 0.2898733615875244,
"eval_logits/rejected": 0.49572646617889404,
"eval_logps/chosen": -832.4283447265625,
"eval_logps/rejected": -916.66357421875,
"eval_loss": 0.9016607403755188,
"eval_rewards/accuracies": 0.6230158805847168,
"eval_rewards/chosen": -4.8659281730651855,
"eval_rewards/margins": 0.9388992786407471,
"eval_rewards/margins_max": 5.651630401611328,
"eval_rewards/margins_min": -2.8163363933563232,
"eval_rewards/margins_std": 2.78544020652771,
"eval_rewards/rejected": -5.8048272132873535,
"eval_runtime": 417.6061,
"eval_samples_per_second": 9.578,
"eval_steps_per_second": 0.151,
"step": 1300
},
{
"epoch": 0.96,
"grad_norm": 1.1392819952008515,
"learning_rate": 1.977768926776896e-09,
"logits/chosen": 0.29715052247047424,
"logits/rejected": 2.052577018737793,
"logps/chosen": -763.3764038085938,
"logps/rejected": -1313.391845703125,
"loss": 0.0621,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.99444055557251,
"rewards/margins": 5.8611674308776855,
"rewards/margins_max": 7.457464694976807,
"rewards/margins_min": 4.264869213104248,
"rewards/margins_std": 2.2575066089630127,
"rewards/rejected": -10.855607986450195,
"step": 1310
},
{
"epoch": 0.97,
"grad_norm": 2.4664515698911433,
"learning_rate": 1.2534955605274233e-09,
"logits/chosen": 0.4122096002101898,
"logits/rejected": 3.4274659156799316,
"logps/chosen": -771.2412719726562,
"logps/rejected": -1840.775390625,
"loss": 0.0786,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.00337028503418,
"rewards/margins": 10.93709659576416,
"rewards/margins_max": 16.795883178710938,
"rewards/margins_min": 5.078312873840332,
"rewards/margins_std": 8.285572052001953,
"rewards/rejected": -15.940465927124023,
"step": 1320
},
{
"epoch": 0.98,
"grad_norm": 5.80559652045183,
"learning_rate": 6.933491585542351e-10,
"logits/chosen": 0.37182289361953735,
"logits/rejected": 3.1600046157836914,
"logps/chosen": -680.2762451171875,
"logps/rejected": -1665.175537109375,
"loss": 0.1695,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.444371223449707,
"rewards/margins": 10.164952278137207,
"rewards/margins_max": 14.587198257446289,
"rewards/margins_min": 5.742705821990967,
"rewards/margins_std": 6.254001140594482,
"rewards/rejected": -14.60932445526123,
"step": 1330
},
{
"epoch": 0.99,
"grad_norm": 8.610945717452623,
"learning_rate": 2.9769931450737694e-10,
"logits/chosen": 0.1386619508266449,
"logits/rejected": 2.0141379833221436,
"logps/chosen": -799.5162353515625,
"logps/rejected": -1834.666015625,
"loss": 0.0767,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.182085990905762,
"rewards/margins": 10.368020057678223,
"rewards/margins_max": 15.239529609680176,
"rewards/margins_min": 5.496510028839111,
"rewards/margins_std": 6.889355659484863,
"rewards/rejected": -15.550105094909668,
"step": 1340
},
{
"epoch": 0.99,
"grad_norm": 33.03425737131523,
"learning_rate": 6.680708454906425e-11,
"logits/chosen": 0.2811238169670105,
"logits/rejected": 1.941292405128479,
"logps/chosen": -751.3411254882812,
"logps/rejected": -1699.15625,
"loss": 0.0687,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.085223197937012,
"rewards/margins": 9.496126174926758,
"rewards/margins_max": 13.1726713180542,
"rewards/margins_min": 5.819581508636475,
"rewards/margins_std": 5.199418544769287,
"rewards/rejected": -14.58134937286377,
"step": 1350
},
{
"epoch": 1.0,
"step": 1359,
"total_flos": 0.0,
"train_loss": 0.21785820982226384,
"train_runtime": 12082.0351,
"train_samples_per_second": 1.8,
"train_steps_per_second": 0.112
}
],
"logging_steps": 10,
"max_steps": 1359,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}