eurus-7b-cost-UI-5e-7 / trainer_state.json
just1nseo's picture
Model save
f06cb12 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995451099317665,
"eval_steps": 100,
"global_step": 1648,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.5390625,
"learning_rate": 3.03030303030303e-09,
"logits/chosen": -3.4050943851470947,
"logits/rejected": -3.1368675231933594,
"logps/chosen": -118.80651092529297,
"logps/rejected": -84.5186767578125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/margins_max": 0.0,
"rewards/margins_min": 0.0,
"rewards/margins_std": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 1.65625,
"learning_rate": 3.0303030303030305e-08,
"logits/chosen": -3.4118552207946777,
"logits/rejected": -3.234715700149536,
"logps/chosen": -112.32723236083984,
"logps/rejected": -153.78240966796875,
"loss": 0.693,
"rewards/accuracies": 0.4444444477558136,
"rewards/chosen": 0.0004033078148495406,
"rewards/margins": 0.00041662290459498763,
"rewards/margins_max": 0.0021899566054344177,
"rewards/margins_min": -0.0013567109126597643,
"rewards/margins_std": 0.002507872646674514,
"rewards/rejected": -1.3315144315129146e-05,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 1.8828125,
"learning_rate": 6.060606060606061e-08,
"logits/chosen": -3.4354801177978516,
"logits/rejected": -3.176407814025879,
"logps/chosen": -118.2829360961914,
"logps/rejected": -184.0032958984375,
"loss": 0.6931,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.00016470803529955447,
"rewards/margins": 1.1057045412599109e-05,
"rewards/margins_max": 0.00216041412204504,
"rewards/margins_min": -0.002138300333172083,
"rewards/margins_std": 0.0030396501533687115,
"rewards/rejected": 0.0001536509662400931,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 1.8671875,
"learning_rate": 9.09090909090909e-08,
"logits/chosen": -3.4078497886657715,
"logits/rejected": -3.205293655395508,
"logps/chosen": -127.51212310791016,
"logps/rejected": -157.24716186523438,
"loss": 0.6925,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.00028022227343171835,
"rewards/margins": 0.0013433375861495733,
"rewards/margins_max": 0.0038831476122140884,
"rewards/margins_min": -0.0011964720906689763,
"rewards/margins_std": 0.003591833170503378,
"rewards/rejected": -0.0010631154291331768,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 1.71875,
"learning_rate": 1.2121212121212122e-07,
"logits/chosen": -3.4350059032440186,
"logits/rejected": -3.2142701148986816,
"logps/chosen": -121.0025634765625,
"logps/rejected": -145.43264770507812,
"loss": 0.6915,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.0012690603034570813,
"rewards/margins": 0.003167077898979187,
"rewards/margins_max": 0.005492820404469967,
"rewards/margins_min": 0.0008413357427343726,
"rewards/margins_std": 0.003289096523076296,
"rewards/rejected": -0.0018980179447680712,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 1.75,
"learning_rate": 1.5151515151515152e-07,
"logits/chosen": -3.4711899757385254,
"logits/rejected": -3.23637056350708,
"logps/chosen": -114.65794372558594,
"logps/rejected": -166.53250122070312,
"loss": 0.6909,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.0023323404602706432,
"rewards/margins": 0.004964292515069246,
"rewards/margins_max": 0.007555422373116016,
"rewards/margins_min": 0.0023731617256999016,
"rewards/margins_std": 0.0036644123028963804,
"rewards/rejected": -0.0026319522876292467,
"step": 50
},
{
"epoch": 0.04,
"grad_norm": 2.0,
"learning_rate": 1.818181818181818e-07,
"logits/chosen": -3.4538276195526123,
"logits/rejected": -3.1886672973632812,
"logps/chosen": -109.4487533569336,
"logps/rejected": -172.9461669921875,
"loss": 0.6898,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.003310429397970438,
"rewards/margins": 0.007032909896224737,
"rewards/margins_max": 0.011247309856116772,
"rewards/margins_min": 0.0028185099363327026,
"rewards/margins_std": 0.005960061680525541,
"rewards/rejected": -0.003722480731084943,
"step": 60
},
{
"epoch": 0.04,
"grad_norm": 1.8359375,
"learning_rate": 2.121212121212121e-07,
"logits/chosen": -3.4295284748077393,
"logits/rejected": -3.1960196495056152,
"logps/chosen": -125.6326904296875,
"logps/rejected": -177.14407348632812,
"loss": 0.688,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.0038899246137589216,
"rewards/margins": 0.009991476312279701,
"rewards/margins_max": 0.013599397614598274,
"rewards/margins_min": 0.006383554544299841,
"rewards/margins_std": 0.005102371331304312,
"rewards/rejected": -0.006101551465690136,
"step": 70
},
{
"epoch": 0.05,
"grad_norm": 1.65625,
"learning_rate": 2.4242424242424244e-07,
"logits/chosen": -3.4546008110046387,
"logits/rejected": -3.259620189666748,
"logps/chosen": -105.1754150390625,
"logps/rejected": -149.17739868164062,
"loss": 0.6864,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.004929172340780497,
"rewards/margins": 0.013479220680892467,
"rewards/margins_max": 0.019636893644928932,
"rewards/margins_min": 0.007321546785533428,
"rewards/margins_std": 0.008708265610039234,
"rewards/rejected": -0.008550046943128109,
"step": 80
},
{
"epoch": 0.05,
"grad_norm": 1.8828125,
"learning_rate": 2.727272727272727e-07,
"logits/chosen": -3.4643356800079346,
"logits/rejected": -3.227538585662842,
"logps/chosen": -122.521240234375,
"logps/rejected": -154.5928497314453,
"loss": 0.6834,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.008893799968063831,
"rewards/margins": 0.021132633090019226,
"rewards/margins_max": 0.027842596173286438,
"rewards/margins_min": 0.014422670006752014,
"rewards/margins_std": 0.00948932021856308,
"rewards/rejected": -0.012238833121955395,
"step": 90
},
{
"epoch": 0.06,
"grad_norm": 1.71875,
"learning_rate": 3.0303030303030305e-07,
"logits/chosen": -3.4493160247802734,
"logits/rejected": -3.2681262493133545,
"logps/chosen": -111.17362976074219,
"logps/rejected": -195.10665893554688,
"loss": 0.6813,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.009519929066300392,
"rewards/margins": 0.023902228102087975,
"rewards/margins_max": 0.0342855267226696,
"rewards/margins_min": 0.013518924824893475,
"rewards/margins_std": 0.014684207737445831,
"rewards/rejected": -0.014382297173142433,
"step": 100
},
{
"epoch": 0.07,
"grad_norm": 1.8125,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": -3.3599190711975098,
"logits/rejected": -3.1862587928771973,
"logps/chosen": -117.963134765625,
"logps/rejected": -156.41465759277344,
"loss": 0.6783,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.008118102326989174,
"rewards/margins": 0.030242860317230225,
"rewards/margins_max": 0.040411077439785004,
"rewards/margins_min": 0.020074646919965744,
"rewards/margins_std": 0.014380025677382946,
"rewards/rejected": -0.0221247561275959,
"step": 110
},
{
"epoch": 0.07,
"grad_norm": 1.609375,
"learning_rate": 3.636363636363636e-07,
"logits/chosen": -3.4270176887512207,
"logits/rejected": -3.26020884513855,
"logps/chosen": -109.92674255371094,
"logps/rejected": -144.8209228515625,
"loss": 0.6757,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.010196666233241558,
"rewards/margins": 0.03683094307780266,
"rewards/margins_max": 0.0503312349319458,
"rewards/margins_min": 0.023330653086304665,
"rewards/margins_std": 0.019092293456196785,
"rewards/rejected": -0.026634279638528824,
"step": 120
},
{
"epoch": 0.08,
"grad_norm": 1.6484375,
"learning_rate": 3.939393939393939e-07,
"logits/chosen": -3.4294886589050293,
"logits/rejected": -3.2369658946990967,
"logps/chosen": -126.5963363647461,
"logps/rejected": -178.168212890625,
"loss": 0.6705,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.009229905903339386,
"rewards/margins": 0.043803971260786057,
"rewards/margins_max": 0.06445904076099396,
"rewards/margins_min": 0.023148905485868454,
"rewards/margins_std": 0.02921067550778389,
"rewards/rejected": -0.03457406908273697,
"step": 130
},
{
"epoch": 0.08,
"grad_norm": 1.7421875,
"learning_rate": 4.242424242424242e-07,
"logits/chosen": -3.441509246826172,
"logits/rejected": -3.2002804279327393,
"logps/chosen": -130.19882202148438,
"logps/rejected": -178.93299865722656,
"loss": 0.6643,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.0069100684486329556,
"rewards/margins": 0.05727902799844742,
"rewards/margins_max": 0.07939378917217255,
"rewards/margins_min": 0.03516425937414169,
"rewards/margins_std": 0.03127499669790268,
"rewards/rejected": -0.050368957221508026,
"step": 140
},
{
"epoch": 0.09,
"grad_norm": 2.125,
"learning_rate": 4.545454545454545e-07,
"logits/chosen": -3.466477155685425,
"logits/rejected": -3.275334119796753,
"logps/chosen": -124.30558776855469,
"logps/rejected": -181.0602569580078,
"loss": 0.6634,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.008361677639186382,
"rewards/margins": 0.06292758136987686,
"rewards/margins_max": 0.08513649553060532,
"rewards/margins_min": 0.04071866348385811,
"rewards/margins_std": 0.0314081534743309,
"rewards/rejected": -0.054565899074077606,
"step": 150
},
{
"epoch": 0.1,
"grad_norm": 1.9140625,
"learning_rate": 4.848484848484849e-07,
"logits/chosen": -3.3910937309265137,
"logits/rejected": -3.2401318550109863,
"logps/chosen": -99.83372497558594,
"logps/rejected": -162.27804565429688,
"loss": 0.6578,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.006624563131481409,
"rewards/margins": 0.06912867724895477,
"rewards/margins_max": 0.1061328873038292,
"rewards/margins_min": 0.032124463468790054,
"rewards/margins_std": 0.05233185365796089,
"rewards/rejected": -0.0625041052699089,
"step": 160
},
{
"epoch": 0.1,
"grad_norm": 1.9375,
"learning_rate": 4.999859762744229e-07,
"logits/chosen": -3.403299331665039,
"logits/rejected": -3.2240326404571533,
"logps/chosen": -101.01579284667969,
"logps/rejected": -159.98538208007812,
"loss": 0.6513,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.00876183807849884,
"rewards/margins": 0.08230775594711304,
"rewards/margins_max": 0.11581530421972275,
"rewards/margins_min": 0.048800211399793625,
"rewards/margins_std": 0.047386832535266876,
"rewards/rejected": -0.0735459253191948,
"step": 170
},
{
"epoch": 0.11,
"grad_norm": 1.953125,
"learning_rate": 4.998737959095448e-07,
"logits/chosen": -3.4143004417419434,
"logits/rejected": -3.1833884716033936,
"logps/chosen": -105.91117095947266,
"logps/rejected": -135.10708618164062,
"loss": 0.6477,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.002522200345993042,
"rewards/margins": 0.09561987221240997,
"rewards/margins_max": 0.14069953560829163,
"rewards/margins_min": 0.05054020881652832,
"rewards/margins_std": 0.06375227868556976,
"rewards/rejected": -0.09309767186641693,
"step": 180
},
{
"epoch": 0.12,
"grad_norm": 2.046875,
"learning_rate": 4.996494855203493e-07,
"logits/chosen": -3.47766375541687,
"logits/rejected": -3.207594633102417,
"logps/chosen": -113.1792221069336,
"logps/rejected": -175.5690460205078,
"loss": 0.6359,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.0006620381027460098,
"rewards/margins": 0.11444854736328125,
"rewards/margins_max": 0.15897879004478455,
"rewards/margins_min": 0.06991832703351974,
"rewards/margins_std": 0.06297525763511658,
"rewards/rejected": -0.11378651857376099,
"step": 190
},
{
"epoch": 0.12,
"grad_norm": 2.09375,
"learning_rate": 4.993131457653681e-07,
"logits/chosen": -3.4641525745391846,
"logits/rejected": -3.2808594703674316,
"logps/chosen": -100.21434020996094,
"logps/rejected": -156.946044921875,
"loss": 0.6389,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.01082407496869564,
"rewards/margins": 0.11547299474477768,
"rewards/margins_max": 0.16020886600017548,
"rewards/margins_min": 0.07073714584112167,
"rewards/margins_std": 0.06326606869697571,
"rewards/rejected": -0.12629708647727966,
"step": 200
},
{
"epoch": 0.13,
"grad_norm": 2.34375,
"learning_rate": 4.988649275759334e-07,
"logits/chosen": -3.428915500640869,
"logits/rejected": -3.1432971954345703,
"logps/chosen": -110.36918640136719,
"logps/rejected": -165.47640991210938,
"loss": 0.6294,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.012739740312099457,
"rewards/margins": 0.12643049657344818,
"rewards/margins_max": 0.18977002799510956,
"rewards/margins_min": 0.0630909651517868,
"rewards/margins_std": 0.0895756185054779,
"rewards/rejected": -0.13917024433612823,
"step": 210
},
{
"epoch": 0.13,
"grad_norm": 1.875,
"learning_rate": 4.983050320884483e-07,
"logits/chosen": -3.4887309074401855,
"logits/rejected": -3.2058892250061035,
"logps/chosen": -126.2535629272461,
"logps/rejected": -187.00015258789062,
"loss": 0.6223,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.01476545911282301,
"rewards/margins": 0.15231844782829285,
"rewards/margins_max": 0.2008267194032669,
"rewards/margins_min": 0.10381016880273819,
"rewards/margins_std": 0.06860103458166122,
"rewards/rejected": -0.16708388924598694,
"step": 220
},
{
"epoch": 0.14,
"grad_norm": 2.125,
"learning_rate": 4.976337105541267e-07,
"logits/chosen": -3.403496503829956,
"logits/rejected": -3.164135217666626,
"logps/chosen": -130.16421508789062,
"logps/rejected": -158.1027374267578,
"loss": 0.6215,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.04382479190826416,
"rewards/margins": 0.14249354600906372,
"rewards/margins_max": 0.20711453258991241,
"rewards/margins_min": 0.07787257432937622,
"rewards/margins_std": 0.09138786792755127,
"rewards/rejected": -0.1863183230161667,
"step": 230
},
{
"epoch": 0.15,
"grad_norm": 2.234375,
"learning_rate": 4.968512642262464e-07,
"logits/chosen": -3.423377513885498,
"logits/rejected": -3.2418792247772217,
"logps/chosen": -104.84086608886719,
"logps/rejected": -180.81430053710938,
"loss": 0.5991,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.016986923292279243,
"rewards/margins": 0.21991240978240967,
"rewards/margins_max": 0.29189637303352356,
"rewards/margins_min": 0.14792843163013458,
"rewards/margins_std": 0.10180072486400604,
"rewards/rejected": -0.23689934611320496,
"step": 240
},
{
"epoch": 0.15,
"grad_norm": 2.03125,
"learning_rate": 4.959580442249614e-07,
"logits/chosen": -3.5027713775634766,
"logits/rejected": -3.174872875213623,
"logps/chosen": -121.6041259765625,
"logps/rejected": -184.39622497558594,
"loss": 0.5971,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.03534800559282303,
"rewards/margins": 0.20791885256767273,
"rewards/margins_max": 0.3044799268245697,
"rewards/margins_min": 0.11135780811309814,
"rewards/margins_std": 0.13655796647071838,
"rewards/rejected": -0.24326686561107635,
"step": 250
},
{
"epoch": 0.16,
"grad_norm": 2.515625,
"learning_rate": 4.94954451379739e-07,
"logits/chosen": -3.4629738330841064,
"logits/rejected": -3.254920244216919,
"logps/chosen": -126.48948669433594,
"logps/rejected": -186.8356475830078,
"loss": 0.5893,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.0572446808218956,
"rewards/margins": 0.242076113820076,
"rewards/margins_max": 0.3453850746154785,
"rewards/margins_min": 0.13876716792583466,
"rewards/margins_std": 0.1461009383201599,
"rewards/rejected": -0.2993208169937134,
"step": 260
},
{
"epoch": 0.16,
"grad_norm": 2.296875,
"learning_rate": 4.938409360494883e-07,
"logits/chosen": -3.4049344062805176,
"logits/rejected": -3.1644232273101807,
"logps/chosen": -117.92335510253906,
"logps/rejected": -183.36587524414062,
"loss": 0.5894,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.053103696554899216,
"rewards/margins": 0.23206424713134766,
"rewards/margins_max": 0.3247820734977722,
"rewards/margins_min": 0.1393464058637619,
"rewards/margins_std": 0.13112285733222961,
"rewards/rejected": -0.28516796231269836,
"step": 270
},
{
"epoch": 0.17,
"grad_norm": 2.296875,
"learning_rate": 4.926179979204632e-07,
"logits/chosen": -3.4576289653778076,
"logits/rejected": -3.24690318107605,
"logps/chosen": -123.93232727050781,
"logps/rejected": -194.07188415527344,
"loss": 0.5881,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.0886482372879982,
"rewards/margins": 0.2548428773880005,
"rewards/margins_max": 0.3780335485935211,
"rewards/margins_min": 0.13165222108364105,
"rewards/margins_std": 0.17421790957450867,
"rewards/rejected": -0.3434911370277405,
"step": 280
},
{
"epoch": 0.18,
"grad_norm": 2.171875,
"learning_rate": 4.912861857820302e-07,
"logits/chosen": -3.3650596141815186,
"logits/rejected": -3.2302684783935547,
"logps/chosen": -111.41851806640625,
"logps/rejected": -206.7620849609375,
"loss": 0.5658,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.10818646103143692,
"rewards/margins": 0.2999250292778015,
"rewards/margins_max": 0.4337504506111145,
"rewards/margins_min": 0.1660996377468109,
"rewards/margins_std": 0.18925771117210388,
"rewards/rejected": -0.40811148285865784,
"step": 290
},
{
"epoch": 0.18,
"grad_norm": 2.8125,
"learning_rate": 4.898460972804008e-07,
"logits/chosen": -3.420971632003784,
"logits/rejected": -3.1563363075256348,
"logps/chosen": -122.5914077758789,
"logps/rejected": -199.3488006591797,
"loss": 0.549,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.08986975252628326,
"rewards/margins": 0.31944385170936584,
"rewards/margins_max": 0.4357198178768158,
"rewards/margins_min": 0.2031678408384323,
"rewards/margins_std": 0.16443908214569092,
"rewards/rejected": -0.4093135893344879,
"step": 300
},
{
"epoch": 0.19,
"grad_norm": 2.625,
"learning_rate": 4.882983786504399e-07,
"logits/chosen": -3.4148566722869873,
"logits/rejected": -3.1982669830322266,
"logps/chosen": -136.39987182617188,
"logps/rejected": -236.8019256591797,
"loss": 0.544,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.15588413178920746,
"rewards/margins": 0.3820186257362366,
"rewards/margins_max": 0.5593412518501282,
"rewards/margins_min": 0.20469605922698975,
"rewards/margins_std": 0.25077205896377563,
"rewards/rejected": -0.5379027724266052,
"step": 310
},
{
"epoch": 0.19,
"grad_norm": 2.421875,
"learning_rate": 4.866437244256695e-07,
"logits/chosen": -3.411226987838745,
"logits/rejected": -3.205670118331909,
"logps/chosen": -129.0833282470703,
"logps/rejected": -206.7379913330078,
"loss": 0.5469,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.14963629841804504,
"rewards/margins": 0.37762314081192017,
"rewards/margins_max": 0.5498967170715332,
"rewards/margins_min": 0.20534953474998474,
"rewards/margins_std": 0.24363164603710175,
"rewards/rejected": -0.5272594690322876,
"step": 320
},
{
"epoch": 0.2,
"grad_norm": 2.625,
"learning_rate": 4.848828771266001e-07,
"logits/chosen": -3.5033020973205566,
"logits/rejected": -3.273409366607666,
"logps/chosen": -156.30401611328125,
"logps/rejected": -197.61549377441406,
"loss": 0.5528,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.19874437153339386,
"rewards/margins": 0.3051304817199707,
"rewards/margins_max": 0.47203540802001953,
"rewards/margins_min": 0.13822560012340546,
"rewards/margins_std": 0.2360391616821289,
"rewards/rejected": -0.5038748979568481,
"step": 330
},
{
"epoch": 0.21,
"grad_norm": 2.703125,
"learning_rate": 4.830166269275266e-07,
"logits/chosen": -3.443110942840576,
"logits/rejected": -3.2243683338165283,
"logps/chosen": -148.33139038085938,
"logps/rejected": -215.1509246826172,
"loss": 0.5203,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.20774701237678528,
"rewards/margins": 0.386242538690567,
"rewards/margins_max": 0.5645895600318909,
"rewards/margins_min": 0.20789547264575958,
"rewards/margins_std": 0.2522208094596863,
"rewards/rejected": -0.5939895510673523,
"step": 340
},
{
"epoch": 0.21,
"grad_norm": 3.234375,
"learning_rate": 4.8104581130194e-07,
"logits/chosen": -3.4214928150177,
"logits/rejected": -3.2243239879608154,
"logps/chosen": -139.30186462402344,
"logps/rejected": -219.8242645263672,
"loss": 0.5001,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.22474750876426697,
"rewards/margins": 0.41614609956741333,
"rewards/margins_max": 0.6199635863304138,
"rewards/margins_min": 0.21232867240905762,
"rewards/margins_std": 0.2882413864135742,
"rewards/rejected": -0.6408936977386475,
"step": 350
},
{
"epoch": 0.22,
"grad_norm": 2.96875,
"learning_rate": 4.789713146467143e-07,
"logits/chosen": -3.401512861251831,
"logits/rejected": -3.190495252609253,
"logps/chosen": -146.44760131835938,
"logps/rejected": -218.2511444091797,
"loss": 0.5132,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2993583083152771,
"rewards/margins": 0.397621214389801,
"rewards/margins_max": 0.5621191263198853,
"rewards/margins_min": 0.2331233024597168,
"rewards/margins_std": 0.23263517022132874,
"rewards/rejected": -0.6969794631004333,
"step": 360
},
{
"epoch": 0.22,
"grad_norm": 2.734375,
"learning_rate": 4.767940678852368e-07,
"logits/chosen": -3.405986785888672,
"logits/rejected": -3.1692707538604736,
"logps/chosen": -153.48361206054688,
"logps/rejected": -218.69650268554688,
"loss": 0.5181,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.3115563988685608,
"rewards/margins": 0.4646086096763611,
"rewards/margins_max": 0.6958065032958984,
"rewards/margins_min": 0.23341062664985657,
"rewards/margins_std": 0.32696327567100525,
"rewards/rejected": -0.7761648893356323,
"step": 370
},
{
"epoch": 0.23,
"grad_norm": 3.171875,
"learning_rate": 4.7451504804965823e-07,
"logits/chosen": -3.3532516956329346,
"logits/rejected": -3.2124714851379395,
"logps/chosen": -138.7807159423828,
"logps/rejected": -285.9217834472656,
"loss": 0.4719,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.31872931122779846,
"rewards/margins": 0.5318517088890076,
"rewards/margins_max": 0.7545040249824524,
"rewards/margins_min": 0.30919957160949707,
"rewards/margins_std": 0.31487777829170227,
"rewards/rejected": -0.850581169128418,
"step": 380
},
{
"epoch": 0.24,
"grad_norm": 3.078125,
"learning_rate": 4.7213527784245395e-07,
"logits/chosen": -3.4123377799987793,
"logits/rejected": -3.1568984985351562,
"logps/chosen": -145.73069763183594,
"logps/rejected": -233.9403533935547,
"loss": 0.4702,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.35064345598220825,
"rewards/margins": 0.5569049715995789,
"rewards/margins_max": 0.7923904657363892,
"rewards/margins_min": 0.3214194178581238,
"rewards/margins_std": 0.33302679657936096,
"rewards/rejected": -0.9075484275817871,
"step": 390
},
{
"epoch": 0.24,
"grad_norm": 3.375,
"learning_rate": 4.6965582517748917e-07,
"logits/chosen": -3.3509891033172607,
"logits/rejected": -3.1365480422973633,
"logps/chosen": -157.23922729492188,
"logps/rejected": -269.90496826171875,
"loss": 0.4719,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.4497006833553314,
"rewards/margins": 0.6059035062789917,
"rewards/margins_max": 0.8811772465705872,
"rewards/margins_min": 0.33062973618507385,
"rewards/margins_std": 0.38929590582847595,
"rewards/rejected": -1.0556042194366455,
"step": 400
},
{
"epoch": 0.25,
"grad_norm": 3.46875,
"learning_rate": 4.6707780270079635e-07,
"logits/chosen": -3.4341049194335938,
"logits/rejected": -3.1959335803985596,
"logps/chosen": -172.27877807617188,
"logps/rejected": -291.41204833984375,
"loss": 0.4602,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.44683152437210083,
"rewards/margins": 0.6504033207893372,
"rewards/margins_max": 0.9344033002853394,
"rewards/margins_min": 0.36640337109565735,
"rewards/margins_std": 0.40163666009902954,
"rewards/rejected": -1.0972349643707275,
"step": 410
},
{
"epoch": 0.25,
"grad_norm": 4.0,
"learning_rate": 4.6440236729127876e-07,
"logits/chosen": -3.3718056678771973,
"logits/rejected": -3.1097371578216553,
"logps/chosen": -169.48446655273438,
"logps/rejected": -302.61016845703125,
"loss": 0.4337,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.5628658533096313,
"rewards/margins": 0.7150717973709106,
"rewards/margins_max": 1.017971396446228,
"rewards/margins_min": 0.4121721684932709,
"rewards/margins_std": 0.42836475372314453,
"rewards/rejected": -1.277937650680542,
"step": 420
},
{
"epoch": 0.26,
"grad_norm": 3.6875,
"learning_rate": 4.616307195415654e-07,
"logits/chosen": -3.3012547492980957,
"logits/rejected": -3.127958059310913,
"logps/chosen": -167.62088012695312,
"logps/rejected": -272.51800537109375,
"loss": 0.4286,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.5819985270500183,
"rewards/margins": 0.7929113507270813,
"rewards/margins_max": 1.1863847970962524,
"rewards/margins_min": 0.3994379937648773,
"rewards/margins_std": 0.5564553737640381,
"rewards/rejected": -1.3749098777770996,
"step": 430
},
{
"epoch": 0.27,
"grad_norm": 3.421875,
"learning_rate": 4.587641032192488e-07,
"logits/chosen": -3.333758592605591,
"logits/rejected": -3.151676654815674,
"logps/chosen": -181.68984985351562,
"logps/rejected": -296.03082275390625,
"loss": 0.3825,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.7118282914161682,
"rewards/margins": 0.7851115465164185,
"rewards/margins_max": 1.1220605373382568,
"rewards/margins_min": 0.4481624662876129,
"rewards/margins_std": 0.4765179753303528,
"rewards/rejected": -1.4969398975372314,
"step": 440
},
{
"epoch": 0.27,
"grad_norm": 4.71875,
"learning_rate": 4.558038047087486e-07,
"logits/chosen": -3.2601521015167236,
"logits/rejected": -3.0268001556396484,
"logps/chosen": -185.16275024414062,
"logps/rejected": -308.6865234375,
"loss": 0.3983,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7758798599243164,
"rewards/margins": 0.886875331401825,
"rewards/margins_max": 1.3197309970855713,
"rewards/margins_min": 0.454019695520401,
"rewards/margins_std": 0.6121503114700317,
"rewards/rejected": -1.6627552509307861,
"step": 450
},
{
"epoch": 0.28,
"grad_norm": 5.09375,
"learning_rate": 4.527511524340508e-07,
"logits/chosen": -3.229076862335205,
"logits/rejected": -3.024235486984253,
"logps/chosen": -197.80221557617188,
"logps/rejected": -347.35479736328125,
"loss": 0.3766,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.8850440979003906,
"rewards/margins": 1.0945155620574951,
"rewards/margins_max": 1.7435153722763062,
"rewards/margins_min": 0.4455157220363617,
"rewards/margins_std": 0.9178244471549988,
"rewards/rejected": -1.9795596599578857,
"step": 460
},
{
"epoch": 0.29,
"grad_norm": 4.65625,
"learning_rate": 4.49607516262582e-07,
"logits/chosen": -3.2619071006774902,
"logits/rejected": -3.051602840423584,
"logps/chosen": -231.6276397705078,
"logps/rejected": -371.31866455078125,
"loss": 0.3899,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -1.0247620344161987,
"rewards/margins": 1.10407555103302,
"rewards/margins_max": 1.7106988430023193,
"rewards/margins_min": 0.49745243787765503,
"rewards/margins_std": 0.8578945994377136,
"rewards/rejected": -2.1288375854492188,
"step": 470
},
{
"epoch": 0.29,
"grad_norm": 3.90625,
"learning_rate": 4.4637430689048626e-07,
"logits/chosen": -3.2792510986328125,
"logits/rejected": -3.040688991546631,
"logps/chosen": -198.93948364257812,
"logps/rejected": -341.1888427734375,
"loss": 0.3362,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -1.007499098777771,
"rewards/margins": 1.0881038904190063,
"rewards/margins_max": 1.634576439857483,
"rewards/margins_min": 0.5416311025619507,
"rewards/margins_std": 0.772828996181488,
"rewards/rejected": -2.0956027507781982,
"step": 480
},
{
"epoch": 0.3,
"grad_norm": 5.34375,
"learning_rate": 4.4305297520957944e-07,
"logits/chosen": -3.227466106414795,
"logits/rejected": -3.0447263717651367,
"logps/chosen": -218.6245574951172,
"logps/rejected": -389.6742248535156,
"loss": 0.325,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.055724024772644,
"rewards/margins": 1.3826014995574951,
"rewards/margins_max": 1.864708662033081,
"rewards/margins_min": 0.9004942178726196,
"rewards/margins_std": 0.6818027496337891,
"rewards/rejected": -2.438325881958008,
"step": 490
},
{
"epoch": 0.3,
"grad_norm": 5.125,
"learning_rate": 4.396450116562669e-07,
"logits/chosen": -3.2189323902130127,
"logits/rejected": -3.0369110107421875,
"logps/chosen": -239.6689910888672,
"logps/rejected": -412.5196838378906,
"loss": 0.3732,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.3009732961654663,
"rewards/margins": 1.1667307615280151,
"rewards/margins_max": 1.9371881484985352,
"rewards/margins_min": 0.39627307653427124,
"rewards/margins_std": 1.0895916223526,
"rewards/rejected": -2.4677042961120605,
"step": 500
},
{
"epoch": 0.31,
"grad_norm": 9.625,
"learning_rate": 4.3615194554271483e-07,
"logits/chosen": -3.2492318153381348,
"logits/rejected": -3.042893886566162,
"logps/chosen": -261.33001708984375,
"logps/rejected": -447.9276428222656,
"loss": 0.3188,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.3674191236495972,
"rewards/margins": 1.4249876737594604,
"rewards/margins_max": 2.208099842071533,
"rewards/margins_min": 0.6418755054473877,
"rewards/margins_std": 1.1074877977371216,
"rewards/rejected": -2.7924067974090576,
"step": 510
},
{
"epoch": 0.32,
"grad_norm": 5.8125,
"learning_rate": 4.325753443705767e-07,
"logits/chosen": -3.217289686203003,
"logits/rejected": -3.0037760734558105,
"logps/chosen": -238.28817749023438,
"logps/rejected": -447.4773864746094,
"loss": 0.2919,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.272761344909668,
"rewards/margins": 1.5220317840576172,
"rewards/margins_max": 2.1593213081359863,
"rewards/margins_min": 0.8847425580024719,
"rewards/margins_std": 0.9012632369995117,
"rewards/rejected": -2.794793128967285,
"step": 520
},
{
"epoch": 0.32,
"grad_norm": 5.25,
"learning_rate": 4.289168131275822e-07,
"logits/chosen": -3.1981008052825928,
"logits/rejected": -2.9687576293945312,
"logps/chosen": -248.31906127929688,
"logps/rejected": -526.4910888671875,
"loss": 0.3017,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.4457924365997314,
"rewards/margins": 1.7948505878448486,
"rewards/margins_max": 2.742126226425171,
"rewards/margins_min": 0.8475747108459473,
"rewards/margins_std": 1.339650273323059,
"rewards/rejected": -3.240643262863159,
"step": 530
},
{
"epoch": 0.33,
"grad_norm": 7.125,
"learning_rate": 4.251779935673044e-07,
"logits/chosen": -3.1895217895507812,
"logits/rejected": -2.982194423675537,
"logps/chosen": -287.61956787109375,
"logps/rejected": -510.9176330566406,
"loss": 0.3085,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.6795003414154053,
"rewards/margins": 1.9267494678497314,
"rewards/margins_max": 2.750401735305786,
"rewards/margins_min": 1.1030967235565186,
"rewards/margins_std": 1.1648204326629639,
"rewards/rejected": -3.606250047683716,
"step": 540
},
{
"epoch": 0.33,
"grad_norm": 5.21875,
"learning_rate": 4.213605634724283e-07,
"logits/chosen": -3.2357590198516846,
"logits/rejected": -2.958969831466675,
"logps/chosen": -260.2586975097656,
"logps/rejected": -494.48944091796875,
"loss": 0.2896,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.4327127933502197,
"rewards/margins": 1.9400370121002197,
"rewards/margins_max": 3.0310521125793457,
"rewards/margins_min": 0.8490220308303833,
"rewards/margins_std": 1.5429283380508423,
"rewards/rejected": -3.3727500438690186,
"step": 550
},
{
"epoch": 0.34,
"grad_norm": 8.5625,
"learning_rate": 4.174662359018515e-07,
"logits/chosen": -3.204619884490967,
"logits/rejected": -2.958706855773926,
"logps/chosen": -267.38641357421875,
"logps/rejected": -474.48919677734375,
"loss": 0.3059,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.5095288753509521,
"rewards/margins": 1.707524299621582,
"rewards/margins_max": 2.5599889755249023,
"rewards/margins_min": 0.8550596237182617,
"rewards/margins_std": 1.2055673599243164,
"rewards/rejected": -3.217053174972534,
"step": 560
},
{
"epoch": 0.35,
"grad_norm": 8.125,
"learning_rate": 4.134967584219549e-07,
"logits/chosen": -3.152198314666748,
"logits/rejected": -2.9612772464752197,
"logps/chosen": -279.5748291015625,
"logps/rejected": -507.6549377441406,
"loss": 0.2771,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.6234207153320312,
"rewards/margins": 1.8984496593475342,
"rewards/margins_max": 2.928384780883789,
"rewards/margins_min": 0.8685151934623718,
"rewards/margins_std": 1.456547737121582,
"rewards/rejected": -3.5218708515167236,
"step": 570
},
{
"epoch": 0.35,
"grad_norm": 6.21875,
"learning_rate": 4.09453912322388e-07,
"logits/chosen": -3.1358606815338135,
"logits/rejected": -2.9630966186523438,
"logps/chosen": -279.37298583984375,
"logps/rejected": -529.50732421875,
"loss": 0.3097,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.6641263961791992,
"rewards/margins": 2.1226673126220703,
"rewards/margins_max": 3.121896266937256,
"rewards/margins_min": 1.1234381198883057,
"rewards/margins_std": 1.413123369216919,
"rewards/rejected": -3.7867934703826904,
"step": 580
},
{
"epoch": 0.36,
"grad_norm": 5.0625,
"learning_rate": 4.0533951181672137e-07,
"logits/chosen": -3.190006971359253,
"logits/rejected": -3.0020487308502197,
"logps/chosen": -259.03961181640625,
"logps/rejected": -524.3358154296875,
"loss": 0.2348,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -1.5405361652374268,
"rewards/margins": 1.977190613746643,
"rewards/margins_max": 2.800302028656006,
"rewards/margins_min": 1.1540789604187012,
"rewards/margins_std": 1.164055585861206,
"rewards/rejected": -3.5177268981933594,
"step": 590
},
{
"epoch": 0.36,
"grad_norm": 13.6875,
"learning_rate": 4.011554032283242e-07,
"logits/chosen": -3.20314359664917,
"logits/rejected": -2.951345682144165,
"logps/chosen": -268.9664001464844,
"logps/rejected": -507.90948486328125,
"loss": 0.2691,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.6380701065063477,
"rewards/margins": 2.148407459259033,
"rewards/margins_max": 3.0667290687561035,
"rewards/margins_min": 1.2300859689712524,
"rewards/margins_std": 1.2987029552459717,
"rewards/rejected": -3.7864773273468018,
"step": 600
},
{
"epoch": 0.37,
"grad_norm": 11.0,
"learning_rate": 3.9690346416183314e-07,
"logits/chosen": -3.1131813526153564,
"logits/rejected": -2.9457013607025146,
"logps/chosen": -289.3182373046875,
"logps/rejected": -539.81298828125,
"loss": 0.2696,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -1.7127681970596313,
"rewards/margins": 2.1862919330596924,
"rewards/margins_max": 3.286137104034424,
"rewards/margins_min": 1.08644700050354,
"rewards/margins_std": 1.5554157495498657,
"rewards/rejected": -3.8990604877471924,
"step": 610
},
{
"epoch": 0.38,
"grad_norm": 13.125,
"learning_rate": 3.9258560266058334e-07,
"logits/chosen": -3.1740329265594482,
"logits/rejected": -3.020383834838867,
"logps/chosen": -305.628173828125,
"logps/rejected": -612.4352416992188,
"loss": 0.2626,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.030916690826416,
"rewards/margins": 2.430964946746826,
"rewards/margins_max": 3.6694366931915283,
"rewards/margins_min": 1.1924933195114136,
"rewards/margins_std": 1.7514636516571045,
"rewards/rejected": -4.461881160736084,
"step": 620
},
{
"epoch": 0.38,
"grad_norm": 7.78125,
"learning_rate": 3.882037563503806e-07,
"logits/chosen": -3.1754307746887207,
"logits/rejected": -2.973268985748291,
"logps/chosen": -300.1325988769531,
"logps/rejected": -587.9697265625,
"loss": 0.2976,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.968949556350708,
"rewards/margins": 2.2382559776306152,
"rewards/margins_max": 3.457249402999878,
"rewards/margins_min": 1.0192627906799316,
"rewards/margins_std": 1.7239166498184204,
"rewards/rejected": -4.207205295562744,
"step": 630
},
{
"epoch": 0.39,
"grad_norm": 10.0625,
"learning_rate": 3.8375989156999803e-07,
"logits/chosen": -3.1942696571350098,
"logits/rejected": -3.031660556793213,
"logps/chosen": -277.3270263671875,
"logps/rejected": -628.7125244140625,
"loss": 0.2381,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -1.681290864944458,
"rewards/margins": 2.714406967163086,
"rewards/margins_max": 3.8859188556671143,
"rewards/margins_min": 1.5428953170776367,
"rewards/margins_std": 1.6567678451538086,
"rewards/rejected": -4.395698070526123,
"step": 640
},
{
"epoch": 0.39,
"grad_norm": 11.0,
"learning_rate": 3.7925600248878865e-07,
"logits/chosen": -3.0972537994384766,
"logits/rejected": -2.915043592453003,
"logps/chosen": -309.4454650878906,
"logps/rejected": -582.4273071289062,
"loss": 0.2835,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.8756641149520874,
"rewards/margins": 2.2718684673309326,
"rewards/margins_max": 3.4796149730682373,
"rewards/margins_min": 1.064121961593628,
"rewards/margins_std": 1.7080116271972656,
"rewards/rejected": -4.1475324630737305,
"step": 650
},
{
"epoch": 0.4,
"grad_norm": 8.625,
"learning_rate": 3.746941102118081e-07,
"logits/chosen": -3.1687800884246826,
"logits/rejected": -2.932328224182129,
"logps/chosen": -321.97003173828125,
"logps/rejected": -614.5501708984375,
"loss": 0.2362,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.9460325241088867,
"rewards/margins": 2.4259374141693115,
"rewards/margins_max": 3.647425413131714,
"rewards/margins_min": 1.2044496536254883,
"rewards/margins_std": 1.7274446487426758,
"rewards/rejected": -4.371970176696777,
"step": 660
},
{
"epoch": 0.41,
"grad_norm": 7.0625,
"learning_rate": 3.700762618728508e-07,
"logits/chosen": -3.105429172515869,
"logits/rejected": -2.8866400718688965,
"logps/chosen": -302.71575927734375,
"logps/rejected": -675.2667236328125,
"loss": 0.2554,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -1.801047682762146,
"rewards/margins": 2.991947650909424,
"rewards/margins_max": 4.486474990844727,
"rewards/margins_min": 1.4974205493927002,
"rewards/margins_std": 2.1135807037353516,
"rewards/rejected": -4.792995452880859,
"step": 670
},
{
"epoch": 0.41,
"grad_norm": 3.828125,
"learning_rate": 3.654045297158057e-07,
"logits/chosen": -3.164304256439209,
"logits/rejected": -2.9912238121032715,
"logps/chosen": -284.3798828125,
"logps/rejected": -547.4119873046875,
"loss": 0.2154,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.7268387079238892,
"rewards/margins": 2.4997963905334473,
"rewards/margins_max": 3.5198917388916016,
"rewards/margins_min": 1.4797013998031616,
"rewards/margins_std": 1.4426321983337402,
"rewards/rejected": -4.226634979248047,
"step": 680
},
{
"epoch": 0.42,
"grad_norm": 5.65625,
"learning_rate": 3.606810101647431e-07,
"logits/chosen": -3.19686222076416,
"logits/rejected": -2.9458096027374268,
"logps/chosen": -318.38238525390625,
"logps/rejected": -583.8440551757812,
"loss": 0.2514,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.952331304550171,
"rewards/margins": 2.3931100368499756,
"rewards/margins_max": 3.5836410522460938,
"rewards/margins_min": 1.2025787830352783,
"rewards/margins_std": 1.6836650371551514,
"rewards/rejected": -4.3454413414001465,
"step": 690
},
{
"epoch": 0.42,
"grad_norm": 8.25,
"learning_rate": 3.559078228831526e-07,
"logits/chosen": -3.1194119453430176,
"logits/rejected": -2.977457046508789,
"logps/chosen": -285.14794921875,
"logps/rejected": -596.258056640625,
"loss": 0.2487,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.8175990581512451,
"rewards/margins": 2.7383248805999756,
"rewards/margins_max": 3.9793529510498047,
"rewards/margins_min": 1.4972972869873047,
"rewards/margins_std": 1.7550785541534424,
"rewards/rejected": -4.555924415588379,
"step": 700
},
{
"epoch": 0.43,
"grad_norm": 8.625,
"learning_rate": 3.510871098227503e-07,
"logits/chosen": -3.2031445503234863,
"logits/rejected": -2.9235167503356934,
"logps/chosen": -335.8504638671875,
"logps/rejected": -606.4368896484375,
"loss": 0.2089,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.097665309906006,
"rewards/margins": 2.564664125442505,
"rewards/margins_max": 3.574242115020752,
"rewards/margins_min": 1.5550854206085205,
"rewards/margins_std": 1.4277592897415161,
"rewards/rejected": -4.662329196929932,
"step": 710
},
{
"epoch": 0.44,
"grad_norm": 12.5625,
"learning_rate": 3.462210342622853e-07,
"logits/chosen": -3.1175758838653564,
"logits/rejected": -2.900524616241455,
"logps/chosen": -316.82696533203125,
"logps/rejected": -691.4205932617188,
"loss": 0.2222,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.02522349357605,
"rewards/margins": 3.1580607891082764,
"rewards/margins_max": 4.554937839508057,
"rewards/margins_min": 1.761183500289917,
"rewards/margins_std": 1.9754825830459595,
"rewards/rejected": -5.183283805847168,
"step": 720
},
{
"epoch": 0.44,
"grad_norm": 6.53125,
"learning_rate": 3.4131177983677614e-07,
"logits/chosen": -3.1615021228790283,
"logits/rejected": -2.9676241874694824,
"logps/chosen": -309.53033447265625,
"logps/rejected": -637.4588623046875,
"loss": 0.3259,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.038564920425415,
"rewards/margins": 2.944957971572876,
"rewards/margins_max": 4.9082746505737305,
"rewards/margins_min": 0.9816409349441528,
"rewards/margins_std": 2.7765493392944336,
"rewards/rejected": -4.983522891998291,
"step": 730
},
{
"epoch": 0.45,
"grad_norm": 16.375,
"learning_rate": 3.363615495576114e-07,
"logits/chosen": -3.172344207763672,
"logits/rejected": -2.932992935180664,
"logps/chosen": -322.94537353515625,
"logps/rejected": -613.6011962890625,
"loss": 0.2776,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.8910267353057861,
"rewards/margins": 2.67472243309021,
"rewards/margins_max": 4.281624794006348,
"rewards/margins_min": 1.0678198337554932,
"rewards/margins_std": 2.272503614425659,
"rewards/rejected": -4.565749168395996,
"step": 740
},
{
"epoch": 0.45,
"grad_norm": 6.71875,
"learning_rate": 3.31372564823956e-07,
"logits/chosen": -3.1593432426452637,
"logits/rejected": -2.9213168621063232,
"logps/chosen": -300.8254089355469,
"logps/rejected": -554.28173828125,
"loss": 0.2749,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.9807758331298828,
"rewards/margins": 2.3717269897460938,
"rewards/margins_max": 3.653285503387451,
"rewards/margins_min": 1.0901682376861572,
"rewards/margins_std": 1.8123977184295654,
"rewards/rejected": -4.352502346038818,
"step": 750
},
{
"epoch": 0.46,
"grad_norm": 12.9375,
"learning_rate": 3.2634706442590585e-07,
"logits/chosen": -3.1167142391204834,
"logits/rejected": -2.943542718887329,
"logps/chosen": -320.70477294921875,
"logps/rejected": -626.4170532226562,
"loss": 0.2486,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.1256468296051025,
"rewards/margins": 2.6652743816375732,
"rewards/margins_max": 4.235686302185059,
"rewards/margins_min": 1.0948628187179565,
"rewards/margins_std": 2.220897674560547,
"rewards/rejected": -4.790921211242676,
"step": 760
},
{
"epoch": 0.47,
"grad_norm": 6.46875,
"learning_rate": 3.2128730353983824e-07,
"logits/chosen": -3.12074875831604,
"logits/rejected": -2.914388418197632,
"logps/chosen": -299.7171936035156,
"logps/rejected": -626.5369262695312,
"loss": 0.2256,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -1.9603103399276733,
"rewards/margins": 2.961648464202881,
"rewards/margins_max": 4.249537467956543,
"rewards/margins_min": 1.673760175704956,
"rewards/margins_std": 1.8213493824005127,
"rewards/rejected": -4.921958923339844,
"step": 770
},
{
"epoch": 0.47,
"grad_norm": 7.1875,
"learning_rate": 3.161955527164092e-07,
"logits/chosen": -3.1619656085968018,
"logits/rejected": -2.9911611080169678,
"logps/chosen": -314.0920104980469,
"logps/rejected": -611.2723999023438,
"loss": 0.2768,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.0593760013580322,
"rewards/margins": 2.617433547973633,
"rewards/margins_max": 3.8496804237365723,
"rewards/margins_min": 1.3851864337921143,
"rewards/margins_std": 1.7426605224609375,
"rewards/rejected": -4.676808834075928,
"step": 780
},
{
"epoch": 0.48,
"grad_norm": 9.8125,
"learning_rate": 3.11074096861651e-07,
"logits/chosen": -3.1253132820129395,
"logits/rejected": -2.948439836502075,
"logps/chosen": -307.06842041015625,
"logps/rejected": -656.5684814453125,
"loss": 0.2793,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.9304516315460205,
"rewards/margins": 3.045482873916626,
"rewards/margins_max": 4.538393974304199,
"rewards/margins_min": 1.5525717735290527,
"rewards/margins_std": 2.111295223236084,
"rewards/rejected": -4.9759345054626465,
"step": 790
},
{
"epoch": 0.49,
"grad_norm": 7.9375,
"learning_rate": 3.0592523421162923e-07,
"logits/chosen": -3.14684796333313,
"logits/rejected": -2.9417788982391357,
"logps/chosen": -311.675048828125,
"logps/rejected": -673.3569946289062,
"loss": 0.2156,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.02833890914917,
"rewards/margins": 3.267068386077881,
"rewards/margins_max": 4.807781219482422,
"rewards/margins_min": 1.7263562679290771,
"rewards/margins_std": 2.178896427154541,
"rewards/rejected": -5.295407295227051,
"step": 800
},
{
"epoch": 0.49,
"grad_norm": 6.0,
"learning_rate": 3.0075127530111604e-07,
"logits/chosen": -3.143428325653076,
"logits/rejected": -2.8957009315490723,
"logps/chosen": -306.6546936035156,
"logps/rejected": -662.9694213867188,
"loss": 0.1952,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -1.7990596294403076,
"rewards/margins": 3.0265026092529297,
"rewards/margins_max": 4.350580215454102,
"rewards/margins_min": 1.7024250030517578,
"rewards/margins_std": 1.8725284337997437,
"rewards/rejected": -4.825562000274658,
"step": 810
},
{
"epoch": 0.5,
"grad_norm": 6.65625,
"learning_rate": 2.9555454192674635e-07,
"logits/chosen": -3.1340126991271973,
"logits/rejected": -2.944532871246338,
"logps/chosen": -296.8179016113281,
"logps/rejected": -654.5142822265625,
"loss": 0.205,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -1.8869205713272095,
"rewards/margins": 3.0365943908691406,
"rewards/margins_max": 4.509632110595703,
"rewards/margins_min": 1.563556432723999,
"rewards/margins_std": 2.0831899642944336,
"rewards/rejected": -4.9235148429870605,
"step": 820
},
{
"epoch": 0.5,
"grad_norm": 5.1875,
"learning_rate": 2.903373661051188e-07,
"logits/chosen": -3.221536636352539,
"logits/rejected": -3.0096938610076904,
"logps/chosen": -318.65667724609375,
"logps/rejected": -705.4375,
"loss": 0.1578,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.9611215591430664,
"rewards/margins": 3.1713013648986816,
"rewards/margins_max": 4.483790874481201,
"rewards/margins_min": 1.858811378479004,
"rewards/margins_std": 1.8561407327651978,
"rewards/rejected": -5.13242244720459,
"step": 830
},
{
"epoch": 0.51,
"grad_norm": 5.40625,
"learning_rate": 2.851020890263113e-07,
"logits/chosen": -3.156846046447754,
"logits/rejected": -2.902345657348633,
"logps/chosen": -333.4150085449219,
"logps/rejected": -682.9464111328125,
"loss": 0.2228,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.0641589164733887,
"rewards/margins": 3.215531826019287,
"rewards/margins_max": 4.6603498458862305,
"rewards/margins_min": 1.7707140445709229,
"rewards/margins_std": 2.043280839920044,
"rewards/rejected": -5.279690742492676,
"step": 840
},
{
"epoch": 0.52,
"grad_norm": 8.75,
"learning_rate": 2.798510600032803e-07,
"logits/chosen": -3.1748039722442627,
"logits/rejected": -2.9051835536956787,
"logps/chosen": -336.9716796875,
"logps/rejected": -674.93212890625,
"loss": 0.2113,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.1552605628967285,
"rewards/margins": 3.1116385459899902,
"rewards/margins_max": 4.737573623657227,
"rewards/margins_min": 1.4857032299041748,
"rewards/margins_std": 2.29941987991333,
"rewards/rejected": -5.266899108886719,
"step": 850
},
{
"epoch": 0.52,
"grad_norm": 8.5625,
"learning_rate": 2.745866354176137e-07,
"logits/chosen": -3.108320713043213,
"logits/rejected": -2.8731348514556885,
"logps/chosen": -337.5401916503906,
"logps/rejected": -751.5953369140625,
"loss": 0.2018,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.291527509689331,
"rewards/margins": 3.7230491638183594,
"rewards/margins_max": 5.445635795593262,
"rewards/margins_min": 2.000462770462036,
"rewards/margins_std": 2.436105251312256,
"rewards/rejected": -6.014577388763428,
"step": 860
},
{
"epoch": 0.53,
"grad_norm": 6.84375,
"learning_rate": 2.693111776621136e-07,
"logits/chosen": -3.124844789505005,
"logits/rejected": -2.866428852081299,
"logps/chosen": -368.060302734375,
"logps/rejected": -777.5119018554688,
"loss": 0.2207,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.525402069091797,
"rewards/margins": 3.681495189666748,
"rewards/margins_max": 5.426387786865234,
"rewards/margins_min": 1.936603307723999,
"rewards/margins_std": 2.4676499366760254,
"rewards/rejected": -6.206897735595703,
"step": 870
},
{
"epoch": 0.53,
"grad_norm": 7.625,
"learning_rate": 2.640270540806793e-07,
"logits/chosen": -3.0661511421203613,
"logits/rejected": -2.8748667240142822,
"logps/chosen": -348.7222595214844,
"logps/rejected": -695.6060791015625,
"loss": 0.2213,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.3182373046875,
"rewards/margins": 3.1713945865631104,
"rewards/margins_max": 4.741438388824463,
"rewards/margins_min": 1.6013505458831787,
"rewards/margins_std": 2.2203774452209473,
"rewards/rejected": -5.489631652832031,
"step": 880
},
{
"epoch": 0.54,
"grad_norm": 5.75,
"learning_rate": 2.5873663590597063e-07,
"logits/chosen": -3.138188362121582,
"logits/rejected": -2.8532166481018066,
"logps/chosen": -332.19305419921875,
"logps/rejected": -696.7973022460938,
"loss": 0.2109,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.0587775707244873,
"rewards/margins": 3.378685474395752,
"rewards/margins_max": 4.750607967376709,
"rewards/margins_min": 2.0067625045776367,
"rewards/margins_std": 1.9401918649673462,
"rewards/rejected": -5.43746280670166,
"step": 890
},
{
"epoch": 0.55,
"grad_norm": 21.875,
"learning_rate": 2.5344229719532484e-07,
"logits/chosen": -3.1494667530059814,
"logits/rejected": -2.9058408737182617,
"logps/chosen": -326.4560241699219,
"logps/rejected": -656.7224731445312,
"loss": 0.2174,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.120781421661377,
"rewards/margins": 3.120711088180542,
"rewards/margins_max": 4.613424777984619,
"rewards/margins_min": 1.627997636795044,
"rewards/margins_std": 2.111015796661377,
"rewards/rejected": -5.24149227142334,
"step": 900
},
{
"epoch": 0.55,
"grad_norm": 12.0,
"learning_rate": 2.481464137654068e-07,
"logits/chosen": -3.1502187252044678,
"logits/rejected": -2.901831865310669,
"logps/chosen": -336.9727783203125,
"logps/rejected": -758.7743530273438,
"loss": 0.2516,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.269904375076294,
"rewards/margins": 3.7340915203094482,
"rewards/margins_max": 5.41564416885376,
"rewards/margins_min": 2.052539110183716,
"rewards/margins_std": 2.3780744075775146,
"rewards/rejected": -6.003995895385742,
"step": 910
},
{
"epoch": 0.56,
"grad_norm": 8.875,
"learning_rate": 2.428513621260683e-07,
"logits/chosen": -3.167316436767578,
"logits/rejected": -2.951566457748413,
"logps/chosen": -350.9910583496094,
"logps/rejected": -678.45703125,
"loss": 0.2035,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.3870468139648438,
"rewards/margins": 3.059993267059326,
"rewards/margins_max": 4.496026039123535,
"rewards/margins_min": 1.6239604949951172,
"rewards/margins_std": 2.0308570861816406,
"rewards/rejected": -5.44704008102417,
"step": 920
},
{
"epoch": 0.56,
"grad_norm": 15.5625,
"learning_rate": 2.375595184138986e-07,
"logits/chosen": -3.1135799884796143,
"logits/rejected": -2.903838872909546,
"logps/chosen": -323.6688232421875,
"logps/rejected": -727.5667114257812,
"loss": 0.265,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.1313717365264893,
"rewards/margins": 3.3087127208709717,
"rewards/margins_max": 4.953644752502441,
"rewards/margins_min": 1.6637804508209229,
"rewards/margins_std": 2.3262856006622314,
"rewards/rejected": -5.440084934234619,
"step": 930
},
{
"epoch": 0.57,
"grad_norm": 17.375,
"learning_rate": 2.3227325732593993e-07,
"logits/chosen": -3.1387646198272705,
"logits/rejected": -2.882930278778076,
"logps/chosen": -320.67132568359375,
"logps/rejected": -736.220458984375,
"loss": 0.1934,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.9799764156341553,
"rewards/margins": 3.8522841930389404,
"rewards/margins_max": 5.505575180053711,
"rewards/margins_min": 2.198992967605591,
"rewards/margins_std": 2.338106870651245,
"rewards/rejected": -5.832260608673096,
"step": 940
},
{
"epoch": 0.58,
"grad_norm": 13.1875,
"learning_rate": 2.2699495105405114e-07,
"logits/chosen": -3.074521541595459,
"logits/rejected": -2.8952858448028564,
"logps/chosen": -324.089111328125,
"logps/rejected": -743.9403076171875,
"loss": 0.2139,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.197805881500244,
"rewards/margins": 3.5057437419891357,
"rewards/margins_max": 5.082973957061768,
"rewards/margins_min": 1.928513526916504,
"rewards/margins_std": 2.2305400371551514,
"rewards/rejected": -5.703549385070801,
"step": 950
},
{
"epoch": 0.58,
"grad_norm": 13.5,
"learning_rate": 2.217269682203937e-07,
"logits/chosen": -3.0950160026550293,
"logits/rejected": -2.8411691188812256,
"logps/chosen": -309.6170959472656,
"logps/rejected": -720.2401123046875,
"loss": 0.23,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.961046814918518,
"rewards/margins": 3.8151397705078125,
"rewards/margins_max": 5.517041206359863,
"rewards/margins_min": 2.113239049911499,
"rewards/margins_std": 2.406851291656494,
"rewards/rejected": -5.776186943054199,
"step": 960
},
{
"epoch": 0.59,
"grad_norm": 2.71875,
"learning_rate": 2.164716728145213e-07,
"logits/chosen": -3.1319289207458496,
"logits/rejected": -2.9833462238311768,
"logps/chosen": -353.3096618652344,
"logps/rejected": -787.9588623046875,
"loss": 0.1898,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.2338433265686035,
"rewards/margins": 3.6919732093811035,
"rewards/margins_max": 5.161503791809082,
"rewards/margins_min": 2.222442626953125,
"rewards/margins_std": 2.078230381011963,
"rewards/rejected": -5.925817012786865,
"step": 970
},
{
"epoch": 0.59,
"grad_norm": 7.09375,
"learning_rate": 2.1123142313254704e-07,
"logits/chosen": -3.119903087615967,
"logits/rejected": -2.9215176105499268,
"logps/chosen": -327.38165283203125,
"logps/rejected": -698.1976318359375,
"loss": 0.2065,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.0507330894470215,
"rewards/margins": 3.221263885498047,
"rewards/margins_max": 4.741235733032227,
"rewards/margins_min": 1.7012920379638672,
"rewards/margins_std": 2.1495652198791504,
"rewards/rejected": -5.271997451782227,
"step": 980
},
{
"epoch": 0.6,
"grad_norm": 10.9375,
"learning_rate": 2.0600857071886596e-07,
"logits/chosen": -3.111619234085083,
"logits/rejected": -2.886859655380249,
"logps/chosen": -348.9073181152344,
"logps/rejected": -710.0587768554688,
"loss": 0.2172,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.1807503700256348,
"rewards/margins": 3.2503695487976074,
"rewards/margins_max": 4.767851829528809,
"rewards/margins_min": 1.7328875064849854,
"rewards/margins_std": 2.1460437774658203,
"rewards/rejected": -5.431119441986084,
"step": 990
},
{
"epoch": 0.61,
"grad_norm": 9.625,
"learning_rate": 2.0080545931090784e-07,
"logits/chosen": -3.1535375118255615,
"logits/rejected": -2.965236186981201,
"logps/chosen": -344.1661071777344,
"logps/rejected": -811.3351440429688,
"loss": 0.2185,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.2291502952575684,
"rewards/margins": 4.049864768981934,
"rewards/margins_max": 5.990394592285156,
"rewards/margins_min": 2.1093358993530273,
"rewards/margins_std": 2.744323253631592,
"rewards/rejected": -6.27901554107666,
"step": 1000
},
{
"epoch": 0.61,
"grad_norm": 17.375,
"learning_rate": 1.9562442378739238e-07,
"logits/chosen": -3.125776767730713,
"logits/rejected": -2.9081075191497803,
"logps/chosen": -299.2303161621094,
"logps/rejected": -708.6509399414062,
"loss": 0.2322,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.9368846416473389,
"rewards/margins": 3.5816776752471924,
"rewards/margins_max": 5.060166835784912,
"rewards/margins_min": 2.1031877994537354,
"rewards/margins_std": 2.09089994430542,
"rewards/rejected": -5.518561840057373,
"step": 1010
},
{
"epoch": 0.62,
"grad_norm": 5.96875,
"learning_rate": 1.9046778912056043e-07,
"logits/chosen": -3.1317784786224365,
"logits/rejected": -2.9367737770080566,
"logps/chosen": -297.7860412597656,
"logps/rejected": -669.5913696289062,
"loss": 0.2341,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.9930238723754883,
"rewards/margins": 3.1364190578460693,
"rewards/margins_max": 4.621811866760254,
"rewards/margins_min": 1.6510257720947266,
"rewards/margins_std": 2.10066294670105,
"rewards/rejected": -5.129443168640137,
"step": 1020
},
{
"epoch": 0.62,
"grad_norm": 10.0625,
"learning_rate": 1.8533786933285106e-07,
"logits/chosen": -3.1418776512145996,
"logits/rejected": -2.9141292572021484,
"logps/chosen": -344.0506286621094,
"logps/rejected": -775.5567626953125,
"loss": 0.2484,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.0711874961853027,
"rewards/margins": 3.7657127380371094,
"rewards/margins_max": 5.59161901473999,
"rewards/margins_min": 1.9398069381713867,
"rewards/margins_std": 2.582221031188965,
"rewards/rejected": -5.836900234222412,
"step": 1030
},
{
"epoch": 0.63,
"grad_norm": 13.1875,
"learning_rate": 1.8023696645849063e-07,
"logits/chosen": -3.1590495109558105,
"logits/rejected": -2.9567525386810303,
"logps/chosen": -320.74176025390625,
"logps/rejected": -711.3518676757812,
"loss": 0.1803,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.144990921020508,
"rewards/margins": 3.530397891998291,
"rewards/margins_max": 5.047208309173584,
"rewards/margins_min": 2.0135867595672607,
"rewards/margins_std": 2.145094394683838,
"rewards/rejected": -5.675388336181641,
"step": 1040
},
{
"epoch": 0.64,
"grad_norm": 28.25,
"learning_rate": 1.7516736951046394e-07,
"logits/chosen": -3.1330792903900146,
"logits/rejected": -2.947277545928955,
"logps/chosen": -344.958251953125,
"logps/rejected": -704.8740234375,
"loss": 0.2298,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.3996665477752686,
"rewards/margins": 3.01820707321167,
"rewards/margins_max": 4.695876598358154,
"rewards/margins_min": 1.3405380249023438,
"rewards/margins_std": 2.3725826740264893,
"rewards/rejected": -5.417874336242676,
"step": 1050
},
{
"epoch": 0.64,
"grad_norm": 17.125,
"learning_rate": 1.7013135345332651e-07,
"logits/chosen": -3.1549530029296875,
"logits/rejected": -2.8647735118865967,
"logps/chosen": -316.10333251953125,
"logps/rejected": -794.4426879882812,
"loss": 0.2395,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.046635627746582,
"rewards/margins": 3.969254970550537,
"rewards/margins_max": 5.835241794586182,
"rewards/margins_min": 2.103269577026367,
"rewards/margins_std": 2.6389026641845703,
"rewards/rejected": -6.0158915519714355,
"step": 1060
},
{
"epoch": 0.65,
"grad_norm": 6.25,
"learning_rate": 1.6513117818232216e-07,
"logits/chosen": -3.1065542697906494,
"logits/rejected": -2.925407648086548,
"logps/chosen": -311.7358093261719,
"logps/rejected": -697.4700927734375,
"loss": 0.2107,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.1502268314361572,
"rewards/margins": 3.424119234085083,
"rewards/margins_max": 5.143943786621094,
"rewards/margins_min": 1.7042953968048096,
"rewards/margins_std": 2.4321987628936768,
"rewards/rejected": -5.574346542358398,
"step": 1070
},
{
"epoch": 0.66,
"grad_norm": 7.1875,
"learning_rate": 1.6016908750926284e-07,
"logits/chosen": -3.199125289916992,
"logits/rejected": -2.9049952030181885,
"logps/chosen": -323.9697265625,
"logps/rejected": -733.3807373046875,
"loss": 0.2562,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.1301119327545166,
"rewards/margins": 3.5133304595947266,
"rewards/margins_max": 5.211588382720947,
"rewards/margins_min": 1.8150726556777954,
"rewards/margins_std": 2.4016995429992676,
"rewards/rejected": -5.643442630767822,
"step": 1080
},
{
"epoch": 0.66,
"grad_norm": 2.03125,
"learning_rate": 1.5524730815562517e-07,
"logits/chosen": -3.093618631362915,
"logits/rejected": -2.912240743637085,
"logps/chosen": -318.77142333984375,
"logps/rejected": -740.55126953125,
"loss": 0.1697,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.103801727294922,
"rewards/margins": 3.8205482959747314,
"rewards/margins_max": 5.348752975463867,
"rewards/margins_min": 2.2923431396484375,
"rewards/margins_std": 2.161208152770996,
"rewards/rejected": -5.924350261688232,
"step": 1090
},
{
"epoch": 0.67,
"grad_norm": 3.8125,
"learning_rate": 1.5036804875331733e-07,
"logits/chosen": -3.138913869857788,
"logits/rejected": -2.938734292984009,
"logps/chosen": -347.51422119140625,
"logps/rejected": -744.4155883789062,
"loss": 0.2236,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.3483636379241943,
"rewards/margins": 3.551708936691284,
"rewards/margins_max": 5.2547502517700195,
"rewards/margins_min": 1.8486677408218384,
"rewards/margins_std": 2.408463954925537,
"rewards/rejected": -5.900073051452637,
"step": 1100
},
{
"epoch": 0.67,
"grad_norm": 7.65625,
"learning_rate": 1.455334988535621e-07,
"logits/chosen": -3.1878058910369873,
"logits/rejected": -2.8721015453338623,
"logps/chosen": -320.9921875,
"logps/rejected": -697.1282958984375,
"loss": 0.2256,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.09895658493042,
"rewards/margins": 3.339977264404297,
"rewards/margins_max": 4.8780035972595215,
"rewards/margins_min": 1.8019511699676514,
"rewards/margins_std": 2.1750974655151367,
"rewards/rejected": -5.438933849334717,
"step": 1110
},
{
"epoch": 0.68,
"grad_norm": 10.9375,
"learning_rate": 1.4074582794434387e-07,
"logits/chosen": -3.1153368949890137,
"logits/rejected": -2.913144826889038,
"logps/chosen": -312.5743103027344,
"logps/rejected": -745.4815673828125,
"loss": 0.1932,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.063194751739502,
"rewards/margins": 3.72094988822937,
"rewards/margins_max": 5.494509696960449,
"rewards/margins_min": 1.9473907947540283,
"rewards/margins_std": 2.5081920623779297,
"rewards/rejected": -5.784144878387451,
"step": 1120
},
{
"epoch": 0.69,
"grad_norm": 10.0,
"learning_rate": 1.36007184476858e-07,
"logits/chosen": -3.132167339324951,
"logits/rejected": -2.922752857208252,
"logps/chosen": -326.2919921875,
"logps/rejected": -773.3209228515625,
"loss": 0.1876,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.1564865112304688,
"rewards/margins": 3.9871773719787598,
"rewards/margins_max": 6.1475114822387695,
"rewards/margins_min": 1.8268429040908813,
"rewards/margins_std": 3.0551745891571045,
"rewards/rejected": -6.143664360046387,
"step": 1130
},
{
"epoch": 0.69,
"grad_norm": 17.5,
"learning_rate": 1.313196949014001e-07,
"logits/chosen": -3.1706137657165527,
"logits/rejected": -2.879790782928467,
"logps/chosen": -335.986328125,
"logps/rejected": -716.2806396484375,
"loss": 0.251,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.1077635288238525,
"rewards/margins": 3.5597450733184814,
"rewards/margins_max": 5.525472164154053,
"rewards/margins_min": 1.5940181016921997,
"rewards/margins_std": 2.7799577713012695,
"rewards/rejected": -5.667508125305176,
"step": 1140
},
{
"epoch": 0.7,
"grad_norm": 22.0,
"learning_rate": 1.266854627131295e-07,
"logits/chosen": -3.135575771331787,
"logits/rejected": -2.9847023487091064,
"logps/chosen": -302.3170471191406,
"logps/rejected": -696.525390625,
"loss": 0.2497,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.0112688541412354,
"rewards/margins": 3.4218602180480957,
"rewards/margins_max": 4.963052749633789,
"rewards/margins_min": 1.8806670904159546,
"rewards/margins_std": 2.1795761585235596,
"rewards/rejected": -5.433128833770752,
"step": 1150
},
{
"epoch": 0.7,
"grad_norm": 7.84375,
"learning_rate": 1.2210656750813203e-07,
"logits/chosen": -3.0703582763671875,
"logits/rejected": -2.871122121810913,
"logps/chosen": -362.7223205566406,
"logps/rejected": -770.84912109375,
"loss": 0.2566,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.4078450202941895,
"rewards/margins": 3.343618392944336,
"rewards/margins_max": 5.4634623527526855,
"rewards/margins_min": 1.2237741947174072,
"rewards/margins_std": 2.9979124069213867,
"rewards/rejected": -5.751462936401367,
"step": 1160
},
{
"epoch": 0.71,
"grad_norm": 8.0,
"learning_rate": 1.1758506405020885e-07,
"logits/chosen": -3.186342716217041,
"logits/rejected": -2.8949544429779053,
"logps/chosen": -336.4505920410156,
"logps/rejected": -697.22412109375,
"loss": 0.1831,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.248119592666626,
"rewards/margins": 3.425248384475708,
"rewards/margins_max": 5.014912128448486,
"rewards/margins_min": 1.8355858325958252,
"rewards/margins_std": 2.2481231689453125,
"rewards/rejected": -5.673368453979492,
"step": 1170
},
{
"epoch": 0.72,
"grad_norm": 7.78125,
"learning_rate": 1.1312298134880799e-07,
"logits/chosen": -3.218524217605591,
"logits/rejected": -2.952272891998291,
"logps/chosen": -343.3241882324219,
"logps/rejected": -660.6552734375,
"loss": 0.2477,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.2785308361053467,
"rewards/margins": 2.914747714996338,
"rewards/margins_max": 4.317625999450684,
"rewards/margins_min": 1.511869192123413,
"rewards/margins_std": 1.983970046043396,
"rewards/rejected": -5.193279266357422,
"step": 1180
},
{
"epoch": 0.72,
"grad_norm": 5.40625,
"learning_rate": 1.0872232174851281e-07,
"logits/chosen": -3.1693577766418457,
"logits/rejected": -2.916531562805176,
"logps/chosen": -357.85003662109375,
"logps/rejected": -758.5368041992188,
"loss": 0.2052,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -2.358463764190674,
"rewards/margins": 3.578223705291748,
"rewards/margins_max": 5.278543472290039,
"rewards/margins_min": 1.8779041767120361,
"rewards/margins_std": 2.4046151638031006,
"rewards/rejected": -5.936688423156738,
"step": 1190
},
{
"epoch": 0.73,
"grad_norm": 4.6875,
"learning_rate": 1.0438506003049735e-07,
"logits/chosen": -3.1248936653137207,
"logits/rejected": -2.8823959827423096,
"logps/chosen": -327.49066162109375,
"logps/rejected": -708.4923095703125,
"loss": 0.2082,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.1310365200042725,
"rewards/margins": 3.5814547538757324,
"rewards/margins_max": 5.441410064697266,
"rewards/margins_min": 1.7214996814727783,
"rewards/margins_std": 2.630373954772949,
"rewards/rejected": -5.712491035461426,
"step": 1200
},
{
"epoch": 0.73,
"grad_norm": 17.125,
"learning_rate": 1.0011314252634908e-07,
"logits/chosen": -3.1236038208007812,
"logits/rejected": -2.9256346225738525,
"logps/chosen": -326.3320617675781,
"logps/rejected": -626.0516357421875,
"loss": 0.2082,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.1023125648498535,
"rewards/margins": 2.722761631011963,
"rewards/margins_max": 3.6524386405944824,
"rewards/margins_min": 1.793083906173706,
"rewards/margins_std": 1.3147621154785156,
"rewards/rejected": -4.825074195861816,
"step": 1210
},
{
"epoch": 0.74,
"grad_norm": 8.25,
"learning_rate": 9.590848624465989e-08,
"logits/chosen": -3.152843475341797,
"logits/rejected": -2.9420909881591797,
"logps/chosen": -330.8764343261719,
"logps/rejected": -701.1426391601562,
"loss": 0.2014,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.0373191833496094,
"rewards/margins": 3.338742733001709,
"rewards/margins_max": 4.737041473388672,
"rewards/margins_min": 1.9404443502426147,
"rewards/margins_std": 1.9774929285049438,
"rewards/rejected": -5.376061916351318,
"step": 1220
},
{
"epoch": 0.75,
"grad_norm": 6.78125,
"learning_rate": 9.17729780107746e-08,
"logits/chosen": -3.130746603012085,
"logits/rejected": -2.9680044651031494,
"logps/chosen": -304.0748596191406,
"logps/rejected": -809.8206787109375,
"loss": 0.1795,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.9301007986068726,
"rewards/margins": 4.500051021575928,
"rewards/margins_max": 6.72606897354126,
"rewards/margins_min": 2.274033308029175,
"rewards/margins_std": 3.1480648517608643,
"rewards/rejected": -6.430152893066406,
"step": 1230
},
{
"epoch": 0.75,
"grad_norm": 8.8125,
"learning_rate": 8.770847362008426e-08,
"logits/chosen": -3.1518819332122803,
"logits/rejected": -2.929631233215332,
"logps/chosen": -314.208251953125,
"logps/rejected": -736.5687255859375,
"loss": 0.1798,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.0510923862457275,
"rewards/margins": 3.5633749961853027,
"rewards/margins_max": 5.141615867614746,
"rewards/margins_min": 1.9851341247558594,
"rewards/margins_std": 2.2319698333740234,
"rewards/rejected": -5.614466667175293,
"step": 1240
},
{
"epoch": 0.76,
"grad_norm": 12.0625,
"learning_rate": 8.371679700524476e-08,
"logits/chosen": -3.0984597206115723,
"logits/rejected": -2.8767600059509277,
"logps/chosen": -379.36767578125,
"logps/rejected": -761.9058837890625,
"loss": 0.3184,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.6515004634857178,
"rewards/margins": 3.444476366043091,
"rewards/margins_max": 5.3620100021362305,
"rewards/margins_min": 1.526942491531372,
"rewards/margins_std": 2.7118022441864014,
"rewards/rejected": -6.095976829528809,
"step": 1250
},
{
"epoch": 0.76,
"grad_norm": 8.5,
"learning_rate": 7.979973941769255e-08,
"logits/chosen": -3.072702646255493,
"logits/rejected": -2.9324452877044678,
"logps/chosen": -315.5204162597656,
"logps/rejected": -710.9278564453125,
"loss": 0.1895,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.1958985328674316,
"rewards/margins": 3.221574068069458,
"rewards/margins_max": 4.918590068817139,
"rewards/margins_min": 1.5245568752288818,
"rewards/margins_std": 2.399944305419922,
"rewards/rejected": -5.417471885681152,
"step": 1260
},
{
"epoch": 0.77,
"grad_norm": 8.9375,
"learning_rate": 7.595905862382704e-08,
"logits/chosen": -3.1583075523376465,
"logits/rejected": -2.929879665374756,
"logps/chosen": -333.9653015136719,
"logps/rejected": -782.7625732421875,
"loss": 0.1715,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -2.2348623275756836,
"rewards/margins": 3.9564566612243652,
"rewards/margins_max": 5.616944313049316,
"rewards/margins_min": 2.295968532562256,
"rewards/margins_std": 2.3482847213745117,
"rewards/rejected": -6.191318988800049,
"step": 1270
},
{
"epoch": 0.78,
"grad_norm": 8.5,
"learning_rate": 7.219647811621874e-08,
"logits/chosen": -3.1165127754211426,
"logits/rejected": -3.0133025646209717,
"logps/chosen": -289.7486877441406,
"logps/rejected": -685.6094970703125,
"loss": 0.2154,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.0258007049560547,
"rewards/margins": 3.255826234817505,
"rewards/margins_max": 4.866274833679199,
"rewards/margins_min": 1.6453778743743896,
"rewards/margins_std": 2.277517795562744,
"rewards/rejected": -5.2816267013549805,
"step": 1280
},
{
"epoch": 0.78,
"grad_norm": 5.8125,
"learning_rate": 6.851368634019777e-08,
"logits/chosen": -3.133932590484619,
"logits/rejected": -2.8580222129821777,
"logps/chosen": -356.24627685546875,
"logps/rejected": -722.9967041015625,
"loss": 0.2697,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.307548999786377,
"rewards/margins": 3.4757111072540283,
"rewards/margins_max": 5.241189479827881,
"rewards/margins_min": 1.7102329730987549,
"rewards/margins_std": 2.4967634677886963,
"rewards/rejected": -5.783260822296143,
"step": 1290
},
{
"epoch": 0.79,
"grad_norm": 9.6875,
"learning_rate": 6.491233593616971e-08,
"logits/chosen": -3.173882484436035,
"logits/rejected": -2.9467577934265137,
"logps/chosen": -351.42913818359375,
"logps/rejected": -698.7190551757812,
"loss": 0.2373,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.2619590759277344,
"rewards/margins": 3.2130227088928223,
"rewards/margins_max": 4.786438941955566,
"rewards/margins_min": 1.6396061182022095,
"rewards/margins_std": 2.225146770477295,
"rewards/rejected": -5.474982261657715,
"step": 1300
},
{
"epoch": 0.79,
"grad_norm": 5.59375,
"learning_rate": 6.139404299799863e-08,
"logits/chosen": -3.14751935005188,
"logits/rejected": -2.9100093841552734,
"logps/chosen": -293.3255920410156,
"logps/rejected": -721.6419677734375,
"loss": 0.2033,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.9109737873077393,
"rewards/margins": 4.052936553955078,
"rewards/margins_max": 6.207828044891357,
"rewards/margins_min": 1.8980449438095093,
"rewards/margins_std": 3.0474772453308105,
"rewards/rejected": -5.963910102844238,
"step": 1310
},
{
"epoch": 0.8,
"grad_norm": 6.5625,
"learning_rate": 5.796038634779057e-08,
"logits/chosen": -3.1226589679718018,
"logits/rejected": -2.879516363143921,
"logps/chosen": -336.1898498535156,
"logps/rejected": -733.499267578125,
"loss": 0.2087,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.25373911857605,
"rewards/margins": 3.4867053031921387,
"rewards/margins_max": 5.092923164367676,
"rewards/margins_min": 1.8804876804351807,
"rewards/margins_std": 2.2715346813201904,
"rewards/rejected": -5.740444660186768,
"step": 1320
},
{
"epoch": 0.81,
"grad_norm": 9.875,
"learning_rate": 5.4612906827402466e-08,
"logits/chosen": -3.1812329292297363,
"logits/rejected": -2.9600119590759277,
"logps/chosen": -335.65435791015625,
"logps/rejected": -723.2822265625,
"loss": 0.2063,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -2.1229610443115234,
"rewards/margins": 3.5574288368225098,
"rewards/margins_max": 4.972989082336426,
"rewards/margins_min": 2.141868829727173,
"rewards/margins_std": 2.001904249191284,
"rewards/rejected": -5.680389404296875,
"step": 1330
},
{
"epoch": 0.81,
"grad_norm": 15.75,
"learning_rate": 5.1353106606994514e-08,
"logits/chosen": -3.166288375854492,
"logits/rejected": -2.9235687255859375,
"logps/chosen": -365.612060546875,
"logps/rejected": -702.6682739257812,
"loss": 0.2482,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.405470371246338,
"rewards/margins": 3.1533291339874268,
"rewards/margins_max": 5.086661338806152,
"rewards/margins_min": 1.2199971675872803,
"rewards/margins_std": 2.7341442108154297,
"rewards/rejected": -5.558799743652344,
"step": 1340
},
{
"epoch": 0.82,
"grad_norm": 7.75,
"learning_rate": 4.818244851093642e-08,
"logits/chosen": -3.1529836654663086,
"logits/rejected": -2.902099609375,
"logps/chosen": -335.6700439453125,
"logps/rejected": -789.0332641601562,
"loss": 0.183,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.0865893363952637,
"rewards/margins": 3.7543671131134033,
"rewards/margins_max": 5.634464740753174,
"rewards/margins_min": 1.874269723892212,
"rewards/margins_std": 2.6588597297668457,
"rewards/rejected": -5.840956211090088,
"step": 1350
},
{
"epoch": 0.82,
"grad_norm": 7.1875,
"learning_rate": 4.5102355361369607e-08,
"logits/chosen": -3.128056764602661,
"logits/rejected": -2.8609917163848877,
"logps/chosen": -304.3298034667969,
"logps/rejected": -660.6272583007812,
"loss": 0.1713,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.9923508167266846,
"rewards/margins": 3.305687665939331,
"rewards/margins_max": 4.834566593170166,
"rewards/margins_min": 1.7768090963363647,
"rewards/margins_std": 2.162160873413086,
"rewards/rejected": -5.298038959503174,
"step": 1360
},
{
"epoch": 0.83,
"grad_norm": 14.0,
"learning_rate": 4.21142093397209e-08,
"logits/chosen": -3.1291439533233643,
"logits/rejected": -2.9123919010162354,
"logps/chosen": -331.235107421875,
"logps/rejected": -673.1749877929688,
"loss": 0.2041,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.2359235286712646,
"rewards/margins": 3.1155121326446533,
"rewards/margins_max": 4.511401176452637,
"rewards/margins_min": 1.7196223735809326,
"rewards/margins_std": 1.974085807800293,
"rewards/rejected": -5.351435661315918,
"step": 1370
},
{
"epoch": 0.84,
"grad_norm": 5.25,
"learning_rate": 3.921935136645327e-08,
"logits/chosen": -3.1134796142578125,
"logits/rejected": -2.8933892250061035,
"logps/chosen": -319.20086669921875,
"logps/rejected": -797.2918701171875,
"loss": 0.1232,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -2.06329607963562,
"rewards/margins": 4.198972225189209,
"rewards/margins_max": 6.017431259155273,
"rewards/margins_min": 2.3805129528045654,
"rewards/margins_std": 2.5716898441314697,
"rewards/rejected": -6.26226806640625,
"step": 1380
},
{
"epoch": 0.84,
"grad_norm": 4.4375,
"learning_rate": 3.6419080499331986e-08,
"logits/chosen": -3.1291165351867676,
"logits/rejected": -2.9171788692474365,
"logps/chosen": -314.25543212890625,
"logps/rejected": -668.8634643554688,
"loss": 0.2378,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.037966251373291,
"rewards/margins": 3.3993752002716064,
"rewards/margins_max": 4.708580017089844,
"rewards/margins_min": 2.090170383453369,
"rewards/margins_std": 1.8514951467514038,
"rewards/rejected": -5.437341213226318,
"step": 1390
},
{
"epoch": 0.85,
"grad_norm": 3.8125,
"learning_rate": 3.371465335047713e-08,
"logits/chosen": -3.1593098640441895,
"logits/rejected": -2.941859722137451,
"logps/chosen": -320.30194091796875,
"logps/rejected": -844.5147705078125,
"loss": 0.1994,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -2.0999464988708496,
"rewards/margins": 4.604376316070557,
"rewards/margins_max": 6.516266822814941,
"rewards/margins_min": 2.6924843788146973,
"rewards/margins_std": 2.7038230895996094,
"rewards/rejected": -6.70432186126709,
"step": 1400
},
{
"epoch": 0.86,
"grad_norm": 8.1875,
"learning_rate": 3.110728352246311e-08,
"logits/chosen": -3.1501519680023193,
"logits/rejected": -2.896111011505127,
"logps/chosen": -315.76397705078125,
"logps/rejected": -660.5819702148438,
"loss": 0.2063,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.091414213180542,
"rewards/margins": 3.229884624481201,
"rewards/margins_max": 4.599147796630859,
"rewards/margins_min": 1.8606210947036743,
"rewards/margins_std": 1.9364306926727295,
"rewards/rejected": -5.321299076080322,
"step": 1410
},
{
"epoch": 0.86,
"grad_norm": 4.0,
"learning_rate": 2.8598141063718217e-08,
"logits/chosen": -3.1871862411499023,
"logits/rejected": -2.935353994369507,
"logps/chosen": -326.63946533203125,
"logps/rejected": -735.0399780273438,
"loss": 0.1791,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.211333990097046,
"rewards/margins": 3.781127452850342,
"rewards/margins_max": 5.652678489685059,
"rewards/margins_min": 1.9095767736434937,
"rewards/margins_std": 2.646772861480713,
"rewards/rejected": -5.992461681365967,
"step": 1420
},
{
"epoch": 0.87,
"grad_norm": 7.8125,
"learning_rate": 2.6188351943469966e-08,
"logits/chosen": -3.1684048175811768,
"logits/rejected": -2.9018070697784424,
"logps/chosen": -379.37646484375,
"logps/rejected": -710.8772583007812,
"loss": 0.2164,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.436944007873535,
"rewards/margins": 3.187828540802002,
"rewards/margins_max": 4.914183139801025,
"rewards/margins_min": 1.461474061012268,
"rewards/margins_std": 2.4414334297180176,
"rewards/rejected": -5.624772071838379,
"step": 1430
},
{
"epoch": 0.87,
"grad_norm": 3.984375,
"learning_rate": 2.3878997546469577e-08,
"logits/chosen": -3.173720121383667,
"logits/rejected": -2.9183566570281982,
"logps/chosen": -352.3770446777344,
"logps/rejected": -733.684814453125,
"loss": 0.1905,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.2058091163635254,
"rewards/margins": 3.5168144702911377,
"rewards/margins_max": 4.97069787979126,
"rewards/margins_min": 2.062930107116699,
"rewards/margins_std": 2.056102752685547,
"rewards/rejected": -5.722623348236084,
"step": 1440
},
{
"epoch": 0.88,
"grad_norm": 8.3125,
"learning_rate": 2.1671114187724603e-08,
"logits/chosen": -3.183567523956299,
"logits/rejected": -2.9641623497009277,
"logps/chosen": -312.87066650390625,
"logps/rejected": -769.3661499023438,
"loss": 0.2043,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.0123302936553955,
"rewards/margins": 4.006140232086182,
"rewards/margins_max": 5.548596382141113,
"rewards/margins_min": 2.4636826515197754,
"rewards/margins_std": 2.181363582611084,
"rewards/rejected": -6.018470287322998,
"step": 1450
},
{
"epoch": 0.89,
"grad_norm": 6.65625,
"learning_rate": 1.9565692647456e-08,
"logits/chosen": -3.114928722381592,
"logits/rejected": -2.891458749771118,
"logps/chosen": -323.8619384765625,
"logps/rejected": -699.9288940429688,
"loss": 0.2067,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.1312808990478516,
"rewards/margins": 3.579425096511841,
"rewards/margins_max": 5.033545017242432,
"rewards/margins_min": 2.125305414199829,
"rewards/margins_std": 2.0564355850219727,
"rewards/rejected": -5.710705757141113,
"step": 1460
},
{
"epoch": 0.89,
"grad_norm": 21.625,
"learning_rate": 1.7563677726488645e-08,
"logits/chosen": -3.192821502685547,
"logits/rejected": -2.9360315799713135,
"logps/chosen": -325.11724853515625,
"logps/rejected": -732.0574340820312,
"loss": 0.1636,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.197850465774536,
"rewards/margins": 3.762909412384033,
"rewards/margins_max": 5.297985553741455,
"rewards/margins_min": 2.2278337478637695,
"rewards/margins_std": 2.170924663543701,
"rewards/rejected": -5.96075963973999,
"step": 1470
},
{
"epoch": 0.9,
"grad_norm": 16.625,
"learning_rate": 1.5665967822275417e-08,
"logits/chosen": -3.124788999557495,
"logits/rejected": -2.9426681995391846,
"logps/chosen": -326.2402648925781,
"logps/rejected": -790.9951782226562,
"loss": 0.2929,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.1549830436706543,
"rewards/margins": 3.9306769371032715,
"rewards/margins_max": 5.507823944091797,
"rewards/margins_min": 2.353529691696167,
"rewards/margins_std": 2.2304234504699707,
"rewards/rejected": -6.085659980773926,
"step": 1480
},
{
"epoch": 0.9,
"grad_norm": 6.34375,
"learning_rate": 1.3873414525744115e-08,
"logits/chosen": -3.157076120376587,
"logits/rejected": -2.8868565559387207,
"logps/chosen": -340.90924072265625,
"logps/rejected": -718.08740234375,
"loss": 0.1802,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.0659420490264893,
"rewards/margins": 3.228773593902588,
"rewards/margins_max": 4.96280574798584,
"rewards/margins_min": 1.4947407245635986,
"rewards/margins_std": 2.4522926807403564,
"rewards/rejected": -5.29471492767334,
"step": 1490
},
{
"epoch": 0.91,
"grad_norm": 7.59375,
"learning_rate": 1.2186822239149158e-08,
"logits/chosen": -3.1129629611968994,
"logits/rejected": -2.8585550785064697,
"logps/chosen": -352.6842346191406,
"logps/rejected": -787.0264892578125,
"loss": 0.2004,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.368502616882324,
"rewards/margins": 3.9364609718322754,
"rewards/margins_max": 5.6117143630981445,
"rewards/margins_min": 2.2612078189849854,
"rewards/margins_std": 2.369166135787964,
"rewards/rejected": -6.304963111877441,
"step": 1500
},
{
"epoch": 0.92,
"grad_norm": 9.875,
"learning_rate": 1.0606947815098467e-08,
"logits/chosen": -3.1238672733306885,
"logits/rejected": -2.8686344623565674,
"logps/chosen": -307.52020263671875,
"logps/rejected": -777.89306640625,
"loss": 0.217,
"rewards/accuracies": 0.9375,
"rewards/chosen": -1.9263112545013428,
"rewards/margins": 4.194777488708496,
"rewards/margins_max": 6.070175647735596,
"rewards/margins_min": 2.3193793296813965,
"rewards/margins_std": 2.6522135734558105,
"rewards/rejected": -6.121088981628418,
"step": 1510
},
{
"epoch": 0.92,
"grad_norm": 37.0,
"learning_rate": 9.134500216918722e-09,
"logits/chosen": -3.1342949867248535,
"logits/rejected": -2.919426441192627,
"logps/chosen": -335.2420959472656,
"logps/rejected": -697.2754516601562,
"loss": 0.2387,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.338940143585205,
"rewards/margins": 3.315152645111084,
"rewards/margins_max": 4.826213836669922,
"rewards/margins_min": 1.8040918111801147,
"rewards/margins_std": 2.136962652206421,
"rewards/rejected": -5.654092311859131,
"step": 1520
},
{
"epoch": 0.93,
"grad_norm": 4.0625,
"learning_rate": 7.770140200510338e-09,
"logits/chosen": -3.099794626235962,
"logits/rejected": -2.881441593170166,
"logps/chosen": -366.3176574707031,
"logps/rejected": -856.1891479492188,
"loss": 0.1464,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.4581868648529053,
"rewards/margins": 4.456292152404785,
"rewards/margins_max": 6.471911430358887,
"rewards/margins_min": 2.440671682357788,
"rewards/margins_std": 2.8505172729492188,
"rewards/rejected": -6.9144792556762695,
"step": 1530
},
{
"epoch": 0.93,
"grad_norm": 18.75,
"learning_rate": 6.5144800178352776e-09,
"logits/chosen": -3.127145290374756,
"logits/rejected": -2.9245965480804443,
"logps/chosen": -357.00323486328125,
"logps/rejected": -728.6698608398438,
"loss": 0.2302,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.4451308250427246,
"rewards/margins": 3.3258328437805176,
"rewards/margins_max": 4.757521152496338,
"rewards/margins_min": 1.894144058227539,
"rewards/margins_std": 2.0247135162353516,
"rewards/rejected": -5.7709641456604,
"step": 1540
},
{
"epoch": 0.94,
"grad_norm": 8.3125,
"learning_rate": 5.368083142171409e-09,
"logits/chosen": -3.1004772186279297,
"logits/rejected": -2.87990403175354,
"logps/chosen": -353.71478271484375,
"logps/rejected": -771.7055053710938,
"loss": 0.2079,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.3986122608184814,
"rewards/margins": 3.5793299674987793,
"rewards/margins_max": 5.495848178863525,
"rewards/margins_min": 1.6628128290176392,
"rewards/margins_std": 2.710364580154419,
"rewards/rejected": -5.97794246673584,
"step": 1550
},
{
"epoch": 0.95,
"grad_norm": 5.78125,
"learning_rate": 4.331464015255526e-09,
"logits/chosen": -3.149285078048706,
"logits/rejected": -2.8654189109802246,
"logps/chosen": -333.75341796875,
"logps/rejected": -872.8679809570312,
"loss": 0.2164,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -2.111713409423828,
"rewards/margins": 4.595676422119141,
"rewards/margins_max": 6.780759334564209,
"rewards/margins_min": 2.410592555999756,
"rewards/margins_std": 3.090175151824951,
"rewards/rejected": -6.707389831542969,
"step": 1560
},
{
"epoch": 0.95,
"grad_norm": 15.375,
"learning_rate": 3.4050878164293695e-09,
"logits/chosen": -3.1380608081817627,
"logits/rejected": -2.8875515460968018,
"logps/chosen": -356.3845520019531,
"logps/rejected": -684.7333984375,
"loss": 0.2406,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.311772108078003,
"rewards/margins": 2.945913553237915,
"rewards/margins_max": 4.341352462768555,
"rewards/margins_min": 1.5504741668701172,
"rewards/margins_std": 1.9734489917755127,
"rewards/rejected": -5.257685661315918,
"step": 1570
},
{
"epoch": 0.96,
"grad_norm": 7.375,
"learning_rate": 2.5893702538920537e-09,
"logits/chosen": -3.112511396408081,
"logits/rejected": -2.9152560234069824,
"logps/chosen": -335.6734924316406,
"logps/rejected": -702.3567504882812,
"loss": 0.2425,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -2.2349658012390137,
"rewards/margins": 3.2430100440979004,
"rewards/margins_max": 4.741458892822266,
"rewards/margins_min": 1.7445614337921143,
"rewards/margins_std": 2.119126081466675,
"rewards/rejected": -5.477975845336914,
"step": 1580
},
{
"epoch": 0.96,
"grad_norm": 7.65625,
"learning_rate": 1.884677378152372e-09,
"logits/chosen": -3.1265830993652344,
"logits/rejected": -2.887659788131714,
"logps/chosen": -346.2403869628906,
"logps/rejected": -709.3991088867188,
"loss": 0.2288,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.28578782081604,
"rewards/margins": 3.251765727996826,
"rewards/margins_max": 5.136622428894043,
"rewards/margins_min": 1.3669096231460571,
"rewards/margins_std": 2.6655895709991455,
"rewards/rejected": -5.5375542640686035,
"step": 1590
},
{
"epoch": 0.97,
"grad_norm": 7.9375,
"learning_rate": 1.2913254177648325e-09,
"logits/chosen": -3.1185574531555176,
"logits/rejected": -2.887942314147949,
"logps/chosen": -304.56109619140625,
"logps/rejected": -669.3530883789062,
"loss": 0.2095,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -2.0278687477111816,
"rewards/margins": 3.3642592430114746,
"rewards/margins_max": 5.217440128326416,
"rewards/margins_min": 1.511077880859375,
"rewards/margins_std": 2.6207940578460693,
"rewards/rejected": -5.39212703704834,
"step": 1600
},
{
"epoch": 0.98,
"grad_norm": 7.28125,
"learning_rate": 8.095806374232295e-10,
"logits/chosen": -3.1527209281921387,
"logits/rejected": -2.837277889251709,
"logps/chosen": -360.6504211425781,
"logps/rejected": -759.2733154296875,
"loss": 0.2634,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -2.213390827178955,
"rewards/margins": 3.6566321849823,
"rewards/margins_max": 5.329567909240723,
"rewards/margins_min": 1.9836972951889038,
"rewards/margins_std": 2.3658881187438965,
"rewards/rejected": -5.870023250579834,
"step": 1610
},
{
"epoch": 0.98,
"grad_norm": 15.4375,
"learning_rate": 4.3965921847513576e-10,
"logits/chosen": -3.155099391937256,
"logits/rejected": -2.9255661964416504,
"logps/chosen": -323.4555358886719,
"logps/rejected": -777.19677734375,
"loss": 0.1381,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.105621337890625,
"rewards/margins": 4.154127597808838,
"rewards/margins_max": 5.744351387023926,
"rewards/margins_min": 2.563903570175171,
"rewards/margins_std": 2.2489163875579834,
"rewards/rejected": -6.259748935699463,
"step": 1620
},
{
"epoch": 0.99,
"grad_norm": 5.40625,
"learning_rate": 1.8172716191142134e-10,
"logits/chosen": -3.1000237464904785,
"logits/rejected": -2.900533676147461,
"logps/chosen": -336.09130859375,
"logps/rejected": -730.9088745117188,
"loss": 0.2025,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.1460416316986084,
"rewards/margins": 3.563811779022217,
"rewards/margins_max": 5.688532829284668,
"rewards/margins_min": 1.4390910863876343,
"rewards/margins_std": 3.0048089027404785,
"rewards/rejected": -5.709853172302246,
"step": 1630
},
{
"epoch": 0.99,
"grad_norm": 8.8125,
"learning_rate": 3.59002138737019e-11,
"logits/chosen": -3.079613208770752,
"logits/rejected": -2.889169931411743,
"logps/chosen": -328.8927917480469,
"logps/rejected": -725.4284057617188,
"loss": 0.1653,
"rewards/accuracies": 0.9375,
"rewards/chosen": -2.0904576778411865,
"rewards/margins": 3.3967292308807373,
"rewards/margins_max": 5.103400230407715,
"rewards/margins_min": 1.6900584697723389,
"rewards/margins_std": 2.413597583770752,
"rewards/rejected": -5.487187385559082,
"step": 1640
},
{
"epoch": 1.0,
"eval_logits/chosen": -2.2152950763702393,
"eval_logits/rejected": -2.1189112663269043,
"eval_logps/chosen": -352.6255798339844,
"eval_logps/rejected": -347.4267272949219,
"eval_loss": 0.6940016150474548,
"eval_rewards/accuracies": 0.5379999876022339,
"eval_rewards/chosen": -0.7885112166404724,
"eval_rewards/margins": 0.058900706470012665,
"eval_rewards/margins_max": 0.7288501262664795,
"eval_rewards/margins_min": -0.5646029114723206,
"eval_rewards/margins_std": 0.42255058884620667,
"eval_rewards/rejected": -0.8474118709564209,
"eval_runtime": 884.8339,
"eval_samples_per_second": 4.521,
"eval_steps_per_second": 0.283,
"step": 1648
},
{
"epoch": 1.0,
"step": 1648,
"total_flos": 0.0,
"train_loss": 0.33046212517520757,
"train_runtime": 17297.4072,
"train_samples_per_second": 1.525,
"train_steps_per_second": 0.095
}
],
"logging_steps": 10,
"max_steps": 1648,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}