gemma-2-27b-it-SimPO-37K / trainer_state.json
AALF's picture
Upload trainer_state.json
d8b395a verified
raw
history blame
156 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9987085665088248,
"eval_steps": 500,
"global_step": 290,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0034438226431338786,
"grad_norm": 52.674065050138765,
"learning_rate": 2.7586206896551723e-08,
"logits/chosen": -5.615417003631592,
"logits/rejected": -5.667238712310791,
"logps/chosen": -0.4943414330482483,
"logps/rejected": -0.6143913865089417,
"loss": 5.1079,
"rewards/accuracies": 0.5,
"rewards/chosen": -4.943414688110352,
"rewards/margins": 1.200499415397644,
"rewards/rejected": -6.143914222717285,
"step": 1
},
{
"epoch": 0.006887645286267757,
"grad_norm": 60.60351785955272,
"learning_rate": 5.517241379310345e-08,
"logits/chosen": -5.4836554527282715,
"logits/rejected": -5.671990871429443,
"logps/chosen": -0.5493791103363037,
"logps/rejected": -0.6132436990737915,
"loss": 5.0887,
"rewards/accuracies": 0.6875,
"rewards/chosen": -5.493791103363037,
"rewards/margins": 0.638646125793457,
"rewards/rejected": -6.132437705993652,
"step": 2
},
{
"epoch": 0.010331467929401636,
"grad_norm": 59.59005871499247,
"learning_rate": 8.275862068965517e-08,
"logits/chosen": -5.246090412139893,
"logits/rejected": -5.381677627563477,
"logps/chosen": -0.45825690031051636,
"logps/rejected": -0.531363308429718,
"loss": 5.0369,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.582569122314453,
"rewards/margins": 0.7310642004013062,
"rewards/rejected": -5.313632965087891,
"step": 3
},
{
"epoch": 0.013775290572535515,
"grad_norm": 58.937392298416974,
"learning_rate": 1.103448275862069e-07,
"logits/chosen": -5.279723167419434,
"logits/rejected": -5.392016410827637,
"logps/chosen": -0.5628327131271362,
"logps/rejected": -0.5180907249450684,
"loss": 5.3846,
"rewards/accuracies": 0.3125,
"rewards/chosen": -5.628327369689941,
"rewards/margins": -0.4474201798439026,
"rewards/rejected": -5.180907249450684,
"step": 4
},
{
"epoch": 0.017219113215669393,
"grad_norm": 51.34542849508367,
"learning_rate": 1.3793103448275863e-07,
"logits/chosen": -5.257406234741211,
"logits/rejected": -5.315362453460693,
"logps/chosen": -0.525193989276886,
"logps/rejected": -0.572905421257019,
"loss": 5.1008,
"rewards/accuracies": 0.5625,
"rewards/chosen": -5.25193977355957,
"rewards/margins": 0.47711431980133057,
"rewards/rejected": -5.729053974151611,
"step": 5
},
{
"epoch": 0.020662935858803272,
"grad_norm": 62.72154095060763,
"learning_rate": 1.6551724137931034e-07,
"logits/chosen": -5.621905326843262,
"logits/rejected": -5.849234104156494,
"logps/chosen": -0.5514707565307617,
"logps/rejected": -0.5989887118339539,
"loss": 5.0616,
"rewards/accuracies": 0.4375,
"rewards/chosen": -5.514707565307617,
"rewards/margins": 0.47517985105514526,
"rewards/rejected": -5.989887237548828,
"step": 6
},
{
"epoch": 0.02410675850193715,
"grad_norm": 58.76872623763212,
"learning_rate": 1.9310344827586208e-07,
"logits/chosen": -5.585313320159912,
"logits/rejected": -5.405150413513184,
"logps/chosen": -0.6232742667198181,
"logps/rejected": -0.5102998614311218,
"loss": 5.2864,
"rewards/accuracies": 0.6875,
"rewards/chosen": -6.232743263244629,
"rewards/margins": -1.129744529724121,
"rewards/rejected": -5.102999210357666,
"step": 7
},
{
"epoch": 0.02755058114507103,
"grad_norm": 49.99034574764161,
"learning_rate": 2.206896551724138e-07,
"logits/chosen": -5.209949016571045,
"logits/rejected": -5.195341110229492,
"logps/chosen": -0.47157737612724304,
"logps/rejected": -0.47744542360305786,
"loss": 5.1666,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.715773582458496,
"rewards/margins": 0.05868096649646759,
"rewards/rejected": -4.7744550704956055,
"step": 8
},
{
"epoch": 0.030994403788204908,
"grad_norm": 58.66744740284535,
"learning_rate": 2.482758620689655e-07,
"logits/chosen": -5.36690616607666,
"logits/rejected": -5.485147953033447,
"logps/chosen": -0.5631701946258545,
"logps/rejected": -0.5826945304870605,
"loss": 5.1211,
"rewards/accuracies": 0.625,
"rewards/chosen": -5.631701946258545,
"rewards/margins": 0.1952425241470337,
"rewards/rejected": -5.826944828033447,
"step": 9
},
{
"epoch": 0.034438226431338786,
"grad_norm": 46.491425410700735,
"learning_rate": 2.7586206896551726e-07,
"logits/chosen": -5.605222225189209,
"logits/rejected": -5.74063777923584,
"logps/chosen": -0.501873791217804,
"logps/rejected": -0.5232819318771362,
"loss": 5.1411,
"rewards/accuracies": 0.5,
"rewards/chosen": -5.018738269805908,
"rewards/margins": 0.21408089995384216,
"rewards/rejected": -5.232819080352783,
"step": 10
},
{
"epoch": 0.037882049074472665,
"grad_norm": 83.27704023131166,
"learning_rate": 3.034482758620689e-07,
"logits/chosen": -5.455163955688477,
"logits/rejected": -5.336368560791016,
"logps/chosen": -0.7997897267341614,
"logps/rejected": -0.5844107270240784,
"loss": 5.6532,
"rewards/accuracies": 0.5,
"rewards/chosen": -7.997897624969482,
"rewards/margins": -2.1537904739379883,
"rewards/rejected": -5.844107151031494,
"step": 11
},
{
"epoch": 0.041325871717606544,
"grad_norm": 48.93901907364537,
"learning_rate": 3.310344827586207e-07,
"logits/chosen": -5.099009990692139,
"logits/rejected": -5.078139781951904,
"logps/chosen": -0.4147089719772339,
"logps/rejected": -0.4047776460647583,
"loss": 5.1796,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.147089958190918,
"rewards/margins": -0.09931322932243347,
"rewards/rejected": -4.047776699066162,
"step": 12
},
{
"epoch": 0.04476969436074042,
"grad_norm": 50.65745487462828,
"learning_rate": 3.586206896551724e-07,
"logits/chosen": -4.989038467407227,
"logits/rejected": -4.911070346832275,
"logps/chosen": -0.5246780514717102,
"logps/rejected": -0.47285518050193787,
"loss": 5.0677,
"rewards/accuracies": 0.5,
"rewards/chosen": -5.2467803955078125,
"rewards/margins": -0.5182281732559204,
"rewards/rejected": -4.728552341461182,
"step": 13
},
{
"epoch": 0.0482135170038743,
"grad_norm": 57.66601162283009,
"learning_rate": 3.8620689655172415e-07,
"logits/chosen": -5.70846700668335,
"logits/rejected": -5.75710916519165,
"logps/chosen": -0.44634702801704407,
"logps/rejected": -0.4534481465816498,
"loss": 5.1503,
"rewards/accuracies": 0.4375,
"rewards/chosen": -4.463469982147217,
"rewards/margins": 0.07101157307624817,
"rewards/rejected": -4.534482002258301,
"step": 14
},
{
"epoch": 0.05165733964700818,
"grad_norm": 64.39971082160424,
"learning_rate": 4.1379310344827586e-07,
"logits/chosen": -5.157317161560059,
"logits/rejected": -5.051291465759277,
"logps/chosen": -0.5117197632789612,
"logps/rejected": -0.6016834378242493,
"loss": 5.1127,
"rewards/accuracies": 0.625,
"rewards/chosen": -5.117197036743164,
"rewards/margins": 0.8996371626853943,
"rewards/rejected": -6.016834735870361,
"step": 15
},
{
"epoch": 0.05510116229014206,
"grad_norm": 43.28657458222468,
"learning_rate": 4.413793103448276e-07,
"logits/chosen": -5.220727920532227,
"logits/rejected": -5.298764228820801,
"logps/chosen": -0.4690421521663666,
"logps/rejected": -0.4100668132305145,
"loss": 4.9565,
"rewards/accuracies": 0.375,
"rewards/chosen": -4.6904215812683105,
"rewards/margins": -0.5897536873817444,
"rewards/rejected": -4.100667953491211,
"step": 16
},
{
"epoch": 0.05854498493327594,
"grad_norm": 55.940864947672644,
"learning_rate": 4.6896551724137923e-07,
"logits/chosen": -5.025401592254639,
"logits/rejected": -5.0607781410217285,
"logps/chosen": -0.6796688437461853,
"logps/rejected": -0.6925962567329407,
"loss": 5.0153,
"rewards/accuracies": 0.5,
"rewards/chosen": -6.796688079833984,
"rewards/margins": 0.12927353382110596,
"rewards/rejected": -6.925961971282959,
"step": 17
},
{
"epoch": 0.061988807576409816,
"grad_norm": 60.73894736133069,
"learning_rate": 4.96551724137931e-07,
"logits/chosen": -5.200845718383789,
"logits/rejected": -5.258395195007324,
"logps/chosen": -0.38808485865592957,
"logps/rejected": -0.38210412859916687,
"loss": 5.0964,
"rewards/accuracies": 0.375,
"rewards/chosen": -3.8808484077453613,
"rewards/margins": -0.059806786477565765,
"rewards/rejected": -3.8210418224334717,
"step": 18
},
{
"epoch": 0.0654326302195437,
"grad_norm": 50.35855568678846,
"learning_rate": 5.241379310344828e-07,
"logits/chosen": -5.284404277801514,
"logits/rejected": -5.353991508483887,
"logps/chosen": -0.4545897841453552,
"logps/rejected": -0.4399701952934265,
"loss": 5.0991,
"rewards/accuracies": 0.3125,
"rewards/chosen": -4.545897483825684,
"rewards/margins": -0.1461959034204483,
"rewards/rejected": -4.399702548980713,
"step": 19
},
{
"epoch": 0.06887645286267757,
"grad_norm": 103.40309567567355,
"learning_rate": 5.517241379310345e-07,
"logits/chosen": -5.317500114440918,
"logits/rejected": -5.262624740600586,
"logps/chosen": -0.4840647280216217,
"logps/rejected": -0.6067878007888794,
"loss": 5.1123,
"rewards/accuracies": 0.5625,
"rewards/chosen": -4.8406476974487305,
"rewards/margins": 1.2272305488586426,
"rewards/rejected": -6.067877769470215,
"step": 20
},
{
"epoch": 0.07232027550581145,
"grad_norm": 46.89431893563974,
"learning_rate": 5.793103448275862e-07,
"logits/chosen": -5.2541890144348145,
"logits/rejected": -5.130758285522461,
"logps/chosen": -0.4697263538837433,
"logps/rejected": -0.5909743905067444,
"loss": 4.937,
"rewards/accuracies": 0.5625,
"rewards/chosen": -4.697263240814209,
"rewards/margins": 1.2124805450439453,
"rewards/rejected": -5.9097442626953125,
"step": 21
},
{
"epoch": 0.07576409814894533,
"grad_norm": 67.56806825458494,
"learning_rate": 6.068965517241378e-07,
"logits/chosen": -4.811267375946045,
"logits/rejected": -4.888763904571533,
"logps/chosen": -0.4097307324409485,
"logps/rejected": -0.40606409311294556,
"loss": 5.1578,
"rewards/accuracies": 0.375,
"rewards/chosen": -4.097307205200195,
"rewards/margins": -0.03666616231203079,
"rewards/rejected": -4.060640811920166,
"step": 22
},
{
"epoch": 0.07920792079207921,
"grad_norm": 61.29631937996971,
"learning_rate": 6.344827586206897e-07,
"logits/chosen": -4.587668418884277,
"logits/rejected": -4.847754001617432,
"logps/chosen": -0.5758047103881836,
"logps/rejected": -0.5185554027557373,
"loss": 4.8982,
"rewards/accuracies": 0.5,
"rewards/chosen": -5.758047580718994,
"rewards/margins": -0.5724934339523315,
"rewards/rejected": -5.185554027557373,
"step": 23
},
{
"epoch": 0.08265174343521309,
"grad_norm": 93.2152404636039,
"learning_rate": 6.620689655172414e-07,
"logits/chosen": -4.800999641418457,
"logits/rejected": -5.004166603088379,
"logps/chosen": -0.3709278702735901,
"logps/rejected": -0.4135555624961853,
"loss": 4.8412,
"rewards/accuracies": 0.5625,
"rewards/chosen": -3.7092788219451904,
"rewards/margins": 0.42627638578414917,
"rewards/rejected": -4.135554790496826,
"step": 24
},
{
"epoch": 0.08609556607834697,
"grad_norm": 64.42035341096593,
"learning_rate": 6.89655172413793e-07,
"logits/chosen": -4.344979286193848,
"logits/rejected": -4.520305633544922,
"logps/chosen": -0.46768859028816223,
"logps/rejected": -0.4844363033771515,
"loss": 4.9329,
"rewards/accuracies": 0.4375,
"rewards/chosen": -4.676885604858398,
"rewards/margins": 0.16747775673866272,
"rewards/rejected": -4.844363212585449,
"step": 25
},
{
"epoch": 0.08953938872148084,
"grad_norm": 56.68263958973131,
"learning_rate": 7.172413793103448e-07,
"logits/chosen": -4.4868011474609375,
"logits/rejected": -4.451172828674316,
"logps/chosen": -0.3706013262271881,
"logps/rejected": -0.44255292415618896,
"loss": 4.8804,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.7060132026672363,
"rewards/margins": 0.7195163369178772,
"rewards/rejected": -4.425529479980469,
"step": 26
},
{
"epoch": 0.09298321136461472,
"grad_norm": 69.68637190790966,
"learning_rate": 7.448275862068965e-07,
"logits/chosen": -4.358417510986328,
"logits/rejected": -4.435417652130127,
"logps/chosen": -0.3981940746307373,
"logps/rejected": -0.4577309489250183,
"loss": 4.8387,
"rewards/accuracies": 0.5625,
"rewards/chosen": -3.981940984725952,
"rewards/margins": 0.5953686237335205,
"rewards/rejected": -4.5773091316223145,
"step": 27
},
{
"epoch": 0.0964270340077486,
"grad_norm": 58.3412204024468,
"learning_rate": 7.724137931034483e-07,
"logits/chosen": -4.649521350860596,
"logits/rejected": -4.8989362716674805,
"logps/chosen": -0.33310776948928833,
"logps/rejected": -0.37322184443473816,
"loss": 4.6174,
"rewards/accuracies": 0.625,
"rewards/chosen": -3.331077814102173,
"rewards/margins": 0.4011409282684326,
"rewards/rejected": -3.7322187423706055,
"step": 28
},
{
"epoch": 0.09987085665088248,
"grad_norm": 56.195572626431165,
"learning_rate": 8e-07,
"logits/chosen": -4.673098087310791,
"logits/rejected": -5.0018768310546875,
"logps/chosen": -0.4161800742149353,
"logps/rejected": -0.3885071277618408,
"loss": 4.65,
"rewards/accuracies": 0.4375,
"rewards/chosen": -4.161801338195801,
"rewards/margins": -0.27672961354255676,
"rewards/rejected": -3.885071277618408,
"step": 29
},
{
"epoch": 0.10331467929401636,
"grad_norm": 67.80702382842614,
"learning_rate": 7.999710236630706e-07,
"logits/chosen": -4.643288612365723,
"logits/rejected": -4.589477062225342,
"logps/chosen": -0.4303164482116699,
"logps/rejected": -0.506043016910553,
"loss": 4.8244,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.303164482116699,
"rewards/margins": 0.7572658658027649,
"rewards/rejected": -5.06043004989624,
"step": 30
},
{
"epoch": 0.10675850193715024,
"grad_norm": 50.50628925100566,
"learning_rate": 7.998840988504232e-07,
"logits/chosen": -4.767556190490723,
"logits/rejected": -4.7690935134887695,
"logps/chosen": -0.403850257396698,
"logps/rejected": -0.44447407126426697,
"loss": 4.7897,
"rewards/accuracies": 0.5625,
"rewards/chosen": -4.0385026931762695,
"rewards/margins": 0.4062381088733673,
"rewards/rejected": -4.4447407722473145,
"step": 31
},
{
"epoch": 0.11020232458028412,
"grad_norm": 60.05314418873607,
"learning_rate": 7.997392381558708e-07,
"logits/chosen": -3.7635271549224854,
"logits/rejected": -3.760200262069702,
"logps/chosen": -0.5402004718780518,
"logps/rejected": -0.5654389262199402,
"loss": 4.7483,
"rewards/accuracies": 0.5,
"rewards/chosen": -5.402004718780518,
"rewards/margins": 0.25238436460494995,
"rewards/rejected": -5.654389381408691,
"step": 32
},
{
"epoch": 0.113646147223418,
"grad_norm": 50.090026374394135,
"learning_rate": 7.99536462567075e-07,
"logits/chosen": -5.203555583953857,
"logits/rejected": -5.3314290046691895,
"logps/chosen": -0.4754854440689087,
"logps/rejected": -0.4819332957267761,
"loss": 4.8249,
"rewards/accuracies": 0.375,
"rewards/chosen": -4.754855155944824,
"rewards/margins": 0.06447845697402954,
"rewards/rejected": -4.819333076477051,
"step": 33
},
{
"epoch": 0.11708996986655187,
"grad_norm": 44.62210441192165,
"learning_rate": 7.992758014625048e-07,
"logits/chosen": -4.730749607086182,
"logits/rejected": -4.70894718170166,
"logps/chosen": -0.3653126657009125,
"logps/rejected": -0.47924527525901794,
"loss": 4.681,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.6531267166137695,
"rewards/margins": 1.1393256187438965,
"rewards/rejected": -4.792451858520508,
"step": 34
},
{
"epoch": 0.12053379250968575,
"grad_norm": 51.36875532303172,
"learning_rate": 7.989572926071799e-07,
"logits/chosen": -4.721662521362305,
"logits/rejected": -4.724156856536865,
"logps/chosen": -0.4223301410675049,
"logps/rejected": -0.4952259063720703,
"loss": 4.5665,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.223300933837891,
"rewards/margins": 0.7289580702781677,
"rewards/rejected": -4.952259063720703,
"step": 35
},
{
"epoch": 0.12397761515281963,
"grad_norm": 57.9863349338661,
"learning_rate": 7.985809821472e-07,
"logits/chosen": -4.691116809844971,
"logits/rejected": -4.813366413116455,
"logps/chosen": -0.4277626872062683,
"logps/rejected": -0.4881032705307007,
"loss": 4.6191,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.277626991271973,
"rewards/margins": 0.6034059524536133,
"rewards/rejected": -4.881032943725586,
"step": 36
},
{
"epoch": 0.1274214377959535,
"grad_norm": 63.01073545994464,
"learning_rate": 7.981469246030587e-07,
"logits/chosen": -4.308718204498291,
"logits/rejected": -4.413212776184082,
"logps/chosen": -0.4789758026599884,
"logps/rejected": -0.58740234375,
"loss": 4.7889,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.789758205413818,
"rewards/margins": 1.0842654705047607,
"rewards/rejected": -5.8740234375,
"step": 37
},
{
"epoch": 0.1308652604390874,
"grad_norm": 83.60559787534415,
"learning_rate": 7.976551828617438e-07,
"logits/chosen": -4.922616481781006,
"logits/rejected": -4.967951774597168,
"logps/chosen": -0.4330342710018158,
"logps/rejected": -0.4283411204814911,
"loss": 4.7034,
"rewards/accuracies": 0.4375,
"rewards/chosen": -4.330342769622803,
"rewards/margins": -0.0469314381480217,
"rewards/rejected": -4.283411026000977,
"step": 38
},
{
"epoch": 0.13430908308222125,
"grad_norm": 57.08486563674846,
"learning_rate": 7.971058281676275e-07,
"logits/chosen": -5.094006061553955,
"logits/rejected": -5.191053867340088,
"logps/chosen": -0.4875888228416443,
"logps/rejected": -0.6387084722518921,
"loss": 4.6644,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.875887870788574,
"rewards/margins": 1.5111969709396362,
"rewards/rejected": -6.3870849609375,
"step": 39
},
{
"epoch": 0.13775290572535515,
"grad_norm": 60.77964329495937,
"learning_rate": 7.964989401121432e-07,
"logits/chosen": -4.993417739868164,
"logits/rejected": -4.969631195068359,
"logps/chosen": -0.33883655071258545,
"logps/rejected": -0.3545013964176178,
"loss": 4.5686,
"rewards/accuracies": 0.5625,
"rewards/chosen": -3.3883657455444336,
"rewards/margins": 0.1566484272480011,
"rewards/rejected": -3.5450141429901123,
"step": 40
},
{
"epoch": 0.141196728368489,
"grad_norm": 40.340190041907185,
"learning_rate": 7.958346066222549e-07,
"logits/chosen": -4.525943756103516,
"logits/rejected": -4.55746603012085,
"logps/chosen": -0.45404478907585144,
"logps/rejected": -0.4469107389450073,
"loss": 4.5937,
"rewards/accuracies": 0.5625,
"rewards/chosen": -4.54044771194458,
"rewards/margins": -0.07134075462818146,
"rewards/rejected": -4.469107151031494,
"step": 41
},
{
"epoch": 0.1446405510116229,
"grad_norm": 57.514362166929665,
"learning_rate": 7.951129239477177e-07,
"logits/chosen": -5.132482528686523,
"logits/rejected": -5.1113786697387695,
"logps/chosen": -0.401109516620636,
"logps/rejected": -0.44438689947128296,
"loss": 4.6176,
"rewards/accuracies": 0.4375,
"rewards/chosen": -4.01109504699707,
"rewards/margins": 0.43277424573898315,
"rewards/rejected": -4.443869113922119,
"step": 42
},
{
"epoch": 0.14808437365475677,
"grad_norm": 60.34883135001456,
"learning_rate": 7.943339966471333e-07,
"logits/chosen": -4.517858982086182,
"logits/rejected": -4.421618461608887,
"logps/chosen": -0.6845810413360596,
"logps/rejected": -0.6314383745193481,
"loss": 4.6897,
"rewards/accuracies": 0.5,
"rewards/chosen": -6.8458099365234375,
"rewards/margins": -0.531426191329956,
"rewards/rejected": -6.3143839836120605,
"step": 43
},
{
"epoch": 0.15152819629789066,
"grad_norm": 65.95554033423765,
"learning_rate": 7.93497937572801e-07,
"logits/chosen": -5.128730297088623,
"logits/rejected": -5.0609660148620605,
"logps/chosen": -0.5347275137901306,
"logps/rejected": -0.5926434397697449,
"loss": 4.7481,
"rewards/accuracies": 0.75,
"rewards/chosen": -5.3472747802734375,
"rewards/margins": 0.5791594386100769,
"rewards/rejected": -5.926434516906738,
"step": 44
},
{
"epoch": 0.15497201894102453,
"grad_norm": 65.36255184426133,
"learning_rate": 7.926048678543684e-07,
"logits/chosen": -4.324880599975586,
"logits/rejected": -4.221179485321045,
"logps/chosen": -0.5375354290008545,
"logps/rejected": -0.7417870163917542,
"loss": 4.4532,
"rewards/accuracies": 0.75,
"rewards/chosen": -5.375354766845703,
"rewards/margins": 2.042515277862549,
"rewards/rejected": -7.417870044708252,
"step": 45
},
{
"epoch": 0.15841584158415842,
"grad_norm": 56.19140828662666,
"learning_rate": 7.916549168812805e-07,
"logits/chosen": -4.412731647491455,
"logits/rejected": -4.406851768493652,
"logps/chosen": -0.43062710762023926,
"logps/rejected": -0.5201414227485657,
"loss": 4.4945,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.306270599365234,
"rewards/margins": 0.8951433897018433,
"rewards/rejected": -5.201414585113525,
"step": 46
},
{
"epoch": 0.16185966422729228,
"grad_norm": 61.47712790834795,
"learning_rate": 7.906482222840346e-07,
"logits/chosen": -3.994800329208374,
"logits/rejected": -3.9059207439422607,
"logps/chosen": -0.4685822129249573,
"logps/rejected": -0.6291300058364868,
"loss": 4.5688,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.685822010040283,
"rewards/margins": 1.6054778099060059,
"rewards/rejected": -6.291299819946289,
"step": 47
},
{
"epoch": 0.16530348687042618,
"grad_norm": 71.97293938631553,
"learning_rate": 7.8958492991424e-07,
"logits/chosen": -4.644060134887695,
"logits/rejected": -4.552207946777344,
"logps/chosen": -0.49269717931747437,
"logps/rejected": -0.48811206221580505,
"loss": 4.3686,
"rewards/accuracies": 0.5,
"rewards/chosen": -4.926971435546875,
"rewards/margins": -0.04585088789463043,
"rewards/rejected": -4.881120204925537,
"step": 48
},
{
"epoch": 0.16874730951356004,
"grad_norm": 50.91272883112091,
"learning_rate": 7.884651938234865e-07,
"logits/chosen": -4.6048712730407715,
"logits/rejected": -4.637516975402832,
"logps/chosen": -0.454245924949646,
"logps/rejected": -0.5149778723716736,
"loss": 4.4144,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.542459011077881,
"rewards/margins": 0.6073201894760132,
"rewards/rejected": -5.149779796600342,
"step": 49
},
{
"epoch": 0.17219113215669393,
"grad_norm": 70.84729258897266,
"learning_rate": 7.872891762410253e-07,
"logits/chosen": -4.788956642150879,
"logits/rejected": -4.830476760864258,
"logps/chosen": -0.5271515846252441,
"logps/rejected": -0.5558156967163086,
"loss": 4.398,
"rewards/accuracies": 0.625,
"rewards/chosen": -5.271515846252441,
"rewards/margins": 0.2866411805152893,
"rewards/rejected": -5.558156967163086,
"step": 50
},
{
"epoch": 0.1756349547998278,
"grad_norm": 72.43110951103814,
"learning_rate": 7.860570475502648e-07,
"logits/chosen": -4.508288860321045,
"logits/rejected": -4.559998035430908,
"logps/chosen": -0.4371810257434845,
"logps/rejected": -0.5790078639984131,
"loss": 4.4228,
"rewards/accuracies": 0.625,
"rewards/chosen": -4.371809959411621,
"rewards/margins": 1.4182684421539307,
"rewards/rejected": -5.790079116821289,
"step": 51
},
{
"epoch": 0.1790787774429617,
"grad_norm": 55.410292700087545,
"learning_rate": 7.847689862640855e-07,
"logits/chosen": -4.518070697784424,
"logits/rejected": -4.57796049118042,
"logps/chosen": -0.4647026062011719,
"logps/rejected": -0.5196883082389832,
"loss": 4.7694,
"rewards/accuracies": 0.5625,
"rewards/chosen": -4.647026062011719,
"rewards/margins": 0.5498570203781128,
"rewards/rejected": -5.196883201599121,
"step": 52
},
{
"epoch": 0.18252260008609555,
"grad_norm": 69.37009413960385,
"learning_rate": 7.834251789989765e-07,
"logits/chosen": -4.978256702423096,
"logits/rejected": -4.886575698852539,
"logps/chosen": -0.5333456993103027,
"logps/rejected": -0.7749611139297485,
"loss": 4.6385,
"rewards/accuracies": 0.5625,
"rewards/chosen": -5.333456516265869,
"rewards/margins": 2.4161548614501953,
"rewards/rejected": -7.7496113777160645,
"step": 53
},
{
"epoch": 0.18596642272922945,
"grad_norm": 72.61257423821304,
"learning_rate": 7.820258204479982e-07,
"logits/chosen": -4.223357677459717,
"logits/rejected": -4.151899337768555,
"logps/chosen": -0.5688156485557556,
"logps/rejected": -0.6057307124137878,
"loss": 4.811,
"rewards/accuracies": 0.5,
"rewards/chosen": -5.688156604766846,
"rewards/margins": 0.3691507577896118,
"rewards/rejected": -6.05730676651001,
"step": 54
},
{
"epoch": 0.1894102453723633,
"grad_norm": 56.657706476039074,
"learning_rate": 7.805711133525747e-07,
"logits/chosen": -4.470883846282959,
"logits/rejected": -4.288090705871582,
"logps/chosen": -0.6821640729904175,
"logps/rejected": -0.6564118266105652,
"loss": 4.6001,
"rewards/accuracies": 0.625,
"rewards/chosen": -6.8216400146484375,
"rewards/margins": -0.25752171874046326,
"rewards/rejected": -6.564118385314941,
"step": 55
},
{
"epoch": 0.1928540680154972,
"grad_norm": 78.23314078065798,
"learning_rate": 7.790612684731209e-07,
"logits/chosen": -4.282840728759766,
"logits/rejected": -4.223234176635742,
"logps/chosen": -0.6843351721763611,
"logps/rejected": -0.8569565415382385,
"loss": 4.456,
"rewards/accuracies": 0.625,
"rewards/chosen": -6.843351364135742,
"rewards/margins": 1.7262136936187744,
"rewards/rejected": -8.569564819335938,
"step": 56
},
{
"epoch": 0.19629789065863107,
"grad_norm": 64.08169300905095,
"learning_rate": 7.774965045585064e-07,
"logits/chosen": -5.029541015625,
"logits/rejected": -5.061357498168945,
"logps/chosen": -0.5916852951049805,
"logps/rejected": -0.6251527667045593,
"loss": 4.3484,
"rewards/accuracies": 0.625,
"rewards/chosen": -5.916852951049805,
"rewards/margins": 0.334674596786499,
"rewards/rejected": -6.251528263092041,
"step": 57
},
{
"epoch": 0.19974171330176496,
"grad_norm": 62.02026429799981,
"learning_rate": 7.758770483143634e-07,
"logits/chosen": -3.820904016494751,
"logits/rejected": -3.8994665145874023,
"logps/chosen": -0.6398332118988037,
"logps/rejected": -0.662560760974884,
"loss": 4.3431,
"rewards/accuracies": 0.5625,
"rewards/chosen": -6.398331642150879,
"rewards/margins": 0.22727595269680023,
"rewards/rejected": -6.625607967376709,
"step": 58
},
{
"epoch": 0.20318553594489883,
"grad_norm": 57.111582977153155,
"learning_rate": 7.742031343702404e-07,
"logits/chosen": -4.509333610534668,
"logits/rejected": -4.401131629943848,
"logps/chosen": -0.5554917454719543,
"logps/rejected": -0.6513252854347229,
"loss": 4.1657,
"rewards/accuracies": 0.8125,
"rewards/chosen": -5.554917335510254,
"rewards/margins": 0.9583351016044617,
"rewards/rejected": -6.5132527351379395,
"step": 59
},
{
"epoch": 0.20662935858803272,
"grad_norm": 68.28922293268536,
"learning_rate": 7.724750052456098e-07,
"logits/chosen": -4.062650680541992,
"logits/rejected": -3.9956672191619873,
"logps/chosen": -0.5649631023406982,
"logps/rejected": -0.7722354531288147,
"loss": 4.3439,
"rewards/accuracies": 0.75,
"rewards/chosen": -5.649630546569824,
"rewards/margins": 2.072723627090454,
"rewards/rejected": -7.722353935241699,
"step": 60
},
{
"epoch": 0.21007318123116658,
"grad_norm": 65.73305092854736,
"learning_rate": 7.706929113147304e-07,
"logits/chosen": -4.709454536437988,
"logits/rejected": -4.698660850524902,
"logps/chosen": -0.6076084971427917,
"logps/rejected": -0.6684498190879822,
"loss": 4.2227,
"rewards/accuracies": 0.6875,
"rewards/chosen": -6.076085090637207,
"rewards/margins": 0.6084132790565491,
"rewards/rejected": -6.684497833251953,
"step": 61
},
{
"epoch": 0.21351700387430048,
"grad_norm": 67.46265604716064,
"learning_rate": 7.688571107703732e-07,
"logits/chosen": -3.963956832885742,
"logits/rejected": -3.938755512237549,
"logps/chosen": -0.5723408460617065,
"logps/rejected": -0.5089117288589478,
"loss": 4.4227,
"rewards/accuracies": 0.4375,
"rewards/chosen": -5.7234086990356445,
"rewards/margins": -0.6342912316322327,
"rewards/rejected": -5.089117527008057,
"step": 62
},
{
"epoch": 0.21696082651743434,
"grad_norm": 60.086542404476894,
"learning_rate": 7.669678695864137e-07,
"logits/chosen": -4.414982795715332,
"logits/rejected": -4.424773693084717,
"logps/chosen": -0.7808203101158142,
"logps/rejected": -0.9502580761909485,
"loss": 4.1876,
"rewards/accuracies": 0.625,
"rewards/chosen": -7.80820369720459,
"rewards/margins": 1.6943775415420532,
"rewards/rejected": -9.502581596374512,
"step": 63
},
{
"epoch": 0.22040464916056823,
"grad_norm": 62.22132599200314,
"learning_rate": 7.650254614792972e-07,
"logits/chosen": -5.100131511688232,
"logits/rejected": -4.888442039489746,
"logps/chosen": -0.7664632797241211,
"logps/rejected": -0.7097909450531006,
"loss": 4.0675,
"rewards/accuracies": 0.5625,
"rewards/chosen": -7.664633274078369,
"rewards/margins": -0.5667227506637573,
"rewards/rejected": -7.097909927368164,
"step": 64
},
{
"epoch": 0.2238484718037021,
"grad_norm": 64.6980516756494,
"learning_rate": 7.630301678683828e-07,
"logits/chosen": -4.501206398010254,
"logits/rejected": -4.3760600090026855,
"logps/chosen": -0.582870602607727,
"logps/rejected": -0.7515184283256531,
"loss": 3.8879,
"rewards/accuracies": 0.6875,
"rewards/chosen": -5.82870626449585,
"rewards/margins": 1.686477780342102,
"rewards/rejected": -7.51518440246582,
"step": 65
},
{
"epoch": 0.227292294446836,
"grad_norm": 58.49975168550397,
"learning_rate": 7.6098227783517e-07,
"logits/chosen": -4.590901851654053,
"logits/rejected": -4.614831447601318,
"logps/chosen": -0.6885466575622559,
"logps/rejected": -0.6569955945014954,
"loss": 4.2328,
"rewards/accuracies": 0.4375,
"rewards/chosen": -6.885467052459717,
"rewards/margins": -0.31551113724708557,
"rewards/rejected": -6.569955825805664,
"step": 66
},
{
"epoch": 0.23073611708996986,
"grad_norm": 71.25690755989906,
"learning_rate": 7.588820880814168e-07,
"logits/chosen": -4.404972553253174,
"logits/rejected": -4.322005271911621,
"logps/chosen": -0.7880414128303528,
"logps/rejected": -0.8913244605064392,
"loss": 4.4685,
"rewards/accuracies": 0.625,
"rewards/chosen": -7.8804144859313965,
"rewards/margins": 1.032829999923706,
"rewards/rejected": -8.913244247436523,
"step": 67
},
{
"epoch": 0.23417993973310375,
"grad_norm": 74.45643519661347,
"learning_rate": 7.567299028861528e-07,
"logits/chosen": -5.07747220993042,
"logits/rejected": -4.910668849945068,
"logps/chosen": -0.8106582760810852,
"logps/rejected": -0.8454375863075256,
"loss": 4.2067,
"rewards/accuracies": 0.8125,
"rewards/chosen": -8.106582641601562,
"rewards/margins": 0.3477928638458252,
"rewards/rejected": -8.454376220703125,
"step": 68
},
{
"epoch": 0.2376237623762376,
"grad_norm": 61.17870132728996,
"learning_rate": 7.54526034061595e-07,
"logits/chosen": -4.368528842926025,
"logits/rejected": -4.182857513427734,
"logps/chosen": -0.7671667337417603,
"logps/rejected": -0.8781678676605225,
"loss": 4.0971,
"rewards/accuracies": 0.625,
"rewards/chosen": -7.67166805267334,
"rewards/margins": 1.110011100769043,
"rewards/rejected": -8.781679153442383,
"step": 69
},
{
"epoch": 0.2410675850193715,
"grad_norm": 80.03873573932812,
"learning_rate": 7.522708009079711e-07,
"logits/chosen": -3.757272720336914,
"logits/rejected": -3.6177382469177246,
"logps/chosen": -0.7591115832328796,
"logps/rejected": -1.0031275749206543,
"loss": 4.4004,
"rewards/accuracies": 0.6875,
"rewards/chosen": -7.591116905212402,
"rewards/margins": 2.4401588439941406,
"rewards/rejected": -10.031274795532227,
"step": 70
},
{
"epoch": 0.24451140766250537,
"grad_norm": 54.78863193631618,
"learning_rate": 7.499645301672599e-07,
"logits/chosen": -4.391002655029297,
"logits/rejected": -4.642823696136475,
"logps/chosen": -0.8277568817138672,
"logps/rejected": -0.8874188661575317,
"loss": 4.0832,
"rewards/accuracies": 0.6875,
"rewards/chosen": -8.277568817138672,
"rewards/margins": 0.5966211557388306,
"rewards/rejected": -8.874189376831055,
"step": 71
},
{
"epoch": 0.24795523030563926,
"grad_norm": 68.33693738907787,
"learning_rate": 7.476075559758513e-07,
"logits/chosen": -4.277254581451416,
"logits/rejected": -4.10576057434082,
"logps/chosen": -0.6340219378471375,
"logps/rejected": -0.8107688426971436,
"loss": 4.3541,
"rewards/accuracies": 0.6875,
"rewards/chosen": -6.340219974517822,
"rewards/margins": 1.767467975616455,
"rewards/rejected": -8.107687950134277,
"step": 72
},
{
"epoch": 0.2513990529487731,
"grad_norm": 56.379646600357916,
"learning_rate": 7.452002198161371e-07,
"logits/chosen": -4.682867050170898,
"logits/rejected": -4.608969211578369,
"logps/chosen": -0.7252380847930908,
"logps/rejected": -0.8175498247146606,
"loss": 3.8474,
"rewards/accuracies": 0.5625,
"rewards/chosen": -7.25238037109375,
"rewards/margins": 0.9231181144714355,
"rewards/rejected": -8.175498962402344,
"step": 73
},
{
"epoch": 0.254842875591907,
"grad_norm": 100.90328367426899,
"learning_rate": 7.427428704670356e-07,
"logits/chosen": -4.861872673034668,
"logits/rejected": -4.656722545623779,
"logps/chosen": -0.7617427706718445,
"logps/rejected": -0.9613173007965088,
"loss": 4.4928,
"rewards/accuracies": 0.625,
"rewards/chosen": -7.617427825927734,
"rewards/margins": 1.9957445859909058,
"rewards/rejected": -9.61317253112793,
"step": 74
},
{
"epoch": 0.2582866982350409,
"grad_norm": 65.6279612427127,
"learning_rate": 7.402358639534602e-07,
"logits/chosen": -5.1001877784729,
"logits/rejected": -5.059464454650879,
"logps/chosen": -0.6768380403518677,
"logps/rejected": -0.8699934482574463,
"loss": 4.1233,
"rewards/accuracies": 0.625,
"rewards/chosen": -6.768380641937256,
"rewards/margins": 1.931553840637207,
"rewards/rejected": -8.699934005737305,
"step": 75
},
{
"epoch": 0.2617305208781748,
"grad_norm": 69.40821628799613,
"learning_rate": 7.376795634947379e-07,
"logits/chosen": -4.4171576499938965,
"logits/rejected": -4.2465434074401855,
"logps/chosen": -0.7788955569267273,
"logps/rejected": -0.8167555332183838,
"loss": 4.309,
"rewards/accuracies": 0.75,
"rewards/chosen": -7.788956165313721,
"rewards/margins": 0.37859874963760376,
"rewards/rejected": -8.16755485534668,
"step": 76
},
{
"epoch": 0.26517434352130864,
"grad_norm": 69.16350786587172,
"learning_rate": 7.350743394519858e-07,
"logits/chosen": -4.930624485015869,
"logits/rejected": -4.70862340927124,
"logps/chosen": -0.8845140933990479,
"logps/rejected": -0.9442533850669861,
"loss": 4.1944,
"rewards/accuracies": 0.625,
"rewards/chosen": -8.845142364501953,
"rewards/margins": 0.5973912477493286,
"rewards/rejected": -9.442534446716309,
"step": 77
},
{
"epoch": 0.2686181661644425,
"grad_norm": 67.06499369198696,
"learning_rate": 7.324205692744521e-07,
"logits/chosen": -5.08651065826416,
"logits/rejected": -5.048566818237305,
"logps/chosen": -0.672334611415863,
"logps/rejected": -0.7581319808959961,
"loss": 4.2669,
"rewards/accuracies": 0.4375,
"rewards/chosen": -6.723345756530762,
"rewards/margins": 0.8579738140106201,
"rewards/rejected": -7.581319808959961,
"step": 78
},
{
"epoch": 0.2720619888075764,
"grad_norm": 85.80640569218886,
"learning_rate": 7.297186374448307e-07,
"logits/chosen": -5.137825012207031,
"logits/rejected": -5.172469139099121,
"logps/chosen": -0.9155557155609131,
"logps/rejected": -1.0751527547836304,
"loss": 4.1234,
"rewards/accuracies": 0.75,
"rewards/chosen": -9.155557632446289,
"rewards/margins": 1.5959699153900146,
"rewards/rejected": -10.751527786254883,
"step": 79
},
{
"epoch": 0.2755058114507103,
"grad_norm": 63.74795493043287,
"learning_rate": 7.269689354235567e-07,
"logits/chosen": -5.289166450500488,
"logits/rejected": -4.827259540557861,
"logps/chosen": -0.7461143136024475,
"logps/rejected": -1.00174081325531,
"loss": 3.6397,
"rewards/accuracies": 0.8125,
"rewards/chosen": -7.461143493652344,
"rewards/margins": 2.5562655925750732,
"rewards/rejected": -10.01740837097168,
"step": 80
},
{
"epoch": 0.27894963409384416,
"grad_norm": 63.4036066323074,
"learning_rate": 7.241718615920916e-07,
"logits/chosen": -5.0095415115356445,
"logits/rejected": -4.8333353996276855,
"logps/chosen": -0.8599931597709656,
"logps/rejected": -1.064732551574707,
"loss": 3.9659,
"rewards/accuracies": 0.8125,
"rewards/chosen": -8.599931716918945,
"rewards/margins": 2.0473945140838623,
"rewards/rejected": -10.647326469421387,
"step": 81
},
{
"epoch": 0.282393456736978,
"grad_norm": 62.58097683888751,
"learning_rate": 7.213278211952038e-07,
"logits/chosen": -4.466184139251709,
"logits/rejected": -4.107361793518066,
"logps/chosen": -0.7377562522888184,
"logps/rejected": -0.9077808856964111,
"loss": 3.8192,
"rewards/accuracies": 0.8125,
"rewards/chosen": -7.377562046051025,
"rewards/margins": 1.7002463340759277,
"rewards/rejected": -9.07780933380127,
"step": 82
},
{
"epoch": 0.28583727938011194,
"grad_norm": 71.46678574585364,
"learning_rate": 7.184372262822574e-07,
"logits/chosen": -4.615472793579102,
"logits/rejected": -4.519737243652344,
"logps/chosen": -0.8602911233901978,
"logps/rejected": -0.9075096845626831,
"loss": 4.0224,
"rewards/accuracies": 0.5625,
"rewards/chosen": -8.602910995483398,
"rewards/margins": 0.47218504548072815,
"rewards/rejected": -9.07509708404541,
"step": 83
},
{
"epoch": 0.2892811020232458,
"grad_norm": 76.77633086094144,
"learning_rate": 7.155004956475131e-07,
"logits/chosen": -5.291561126708984,
"logits/rejected": -4.816816329956055,
"logps/chosen": -0.7795137166976929,
"logps/rejected": -0.9014157056808472,
"loss": 3.9316,
"rewards/accuracies": 0.8125,
"rewards/chosen": -7.79513692855835,
"rewards/margins": 1.2190203666687012,
"rewards/rejected": -9.01415729522705,
"step": 84
},
{
"epoch": 0.29272492466637967,
"grad_norm": 64.07924704882033,
"learning_rate": 7.125180547694526e-07,
"logits/chosen": -5.0156683921813965,
"logits/rejected": -4.72418737411499,
"logps/chosen": -0.8232897520065308,
"logps/rejected": -1.23880136013031,
"loss": 3.6287,
"rewards/accuracies": 0.6875,
"rewards/chosen": -8.23289680480957,
"rewards/margins": 4.155117034912109,
"rewards/rejected": -12.388014793395996,
"step": 85
},
{
"epoch": 0.29616874730951354,
"grad_norm": 60.650149825273516,
"learning_rate": 7.094903357491345e-07,
"logits/chosen": -4.864440441131592,
"logits/rejected": -4.457652568817139,
"logps/chosen": -0.8692309260368347,
"logps/rejected": -1.0946143865585327,
"loss": 3.9978,
"rewards/accuracies": 0.8125,
"rewards/chosen": -8.69230842590332,
"rewards/margins": 2.2538340091705322,
"rewards/rejected": -10.94614315032959,
"step": 86
},
{
"epoch": 0.29961256995264746,
"grad_norm": 57.554296714602465,
"learning_rate": 7.064177772475911e-07,
"logits/chosen": -5.011836528778076,
"logits/rejected": -5.0129570960998535,
"logps/chosen": -0.9953622221946716,
"logps/rejected": -1.1730215549468994,
"loss": 3.945,
"rewards/accuracies": 0.625,
"rewards/chosen": -9.953622817993164,
"rewards/margins": 1.7765934467315674,
"rewards/rejected": -11.730216026306152,
"step": 87
},
{
"epoch": 0.3030563925957813,
"grad_norm": 76.32625861331243,
"learning_rate": 7.033008244222745e-07,
"logits/chosen": -5.204478740692139,
"logits/rejected": -4.811039924621582,
"logps/chosen": -1.0616154670715332,
"logps/rejected": -1.0835515260696411,
"loss": 3.803,
"rewards/accuracies": 0.6875,
"rewards/chosen": -10.616154670715332,
"rewards/margins": 0.2193598747253418,
"rewards/rejected": -10.835514068603516,
"step": 88
},
{
"epoch": 0.3065002152389152,
"grad_norm": 74.9886645079608,
"learning_rate": 7.001399288625609e-07,
"logits/chosen": -5.231860637664795,
"logits/rejected": -4.674942970275879,
"logps/chosen": -0.9697386026382446,
"logps/rejected": -1.13163423538208,
"loss": 3.7676,
"rewards/accuracies": 0.6875,
"rewards/chosen": -9.697385787963867,
"rewards/margins": 1.6189574003219604,
"rewards/rejected": -11.316343307495117,
"step": 89
},
{
"epoch": 0.30994403788204905,
"grad_norm": 103.40467284986448,
"learning_rate": 6.969355485243239e-07,
"logits/chosen": -5.283835411071777,
"logits/rejected": -5.210239410400391,
"logps/chosen": -0.9744136929512024,
"logps/rejected": -1.04610276222229,
"loss": 4.0352,
"rewards/accuracies": 0.4375,
"rewards/chosen": -9.744136810302734,
"rewards/margins": 0.7168899774551392,
"rewards/rejected": -10.461027145385742,
"step": 90
},
{
"epoch": 0.31338786052518297,
"grad_norm": 75.93291524225512,
"learning_rate": 6.936881476635852e-07,
"logits/chosen": -6.081892013549805,
"logits/rejected": -5.807435989379883,
"logps/chosen": -1.129875898361206,
"logps/rejected": -1.387671709060669,
"loss": 4.1486,
"rewards/accuracies": 0.6875,
"rewards/chosen": -11.298759460449219,
"rewards/margins": 2.5779573917388916,
"rewards/rejected": -13.876716613769531,
"step": 91
},
{
"epoch": 0.31683168316831684,
"grad_norm": 81.2673779492731,
"learning_rate": 6.903981967692524e-07,
"logits/chosen": -5.27292013168335,
"logits/rejected": -4.817174911499023,
"logps/chosen": -0.9680742621421814,
"logps/rejected": -1.3815770149230957,
"loss": 3.5383,
"rewards/accuracies": 0.875,
"rewards/chosen": -9.680743217468262,
"rewards/margins": 4.135027885437012,
"rewards/rejected": -13.815771102905273,
"step": 92
},
{
"epoch": 0.3202755058114507,
"grad_norm": 74.43824451243964,
"learning_rate": 6.870661724949532e-07,
"logits/chosen": -5.829610824584961,
"logits/rejected": -5.776001453399658,
"logps/chosen": -0.9559181928634644,
"logps/rejected": -1.1338019371032715,
"loss": 3.772,
"rewards/accuracies": 0.6875,
"rewards/chosen": -9.559182167053223,
"rewards/margins": 1.778836965560913,
"rewards/rejected": -11.338018417358398,
"step": 93
},
{
"epoch": 0.32371932845458457,
"grad_norm": 80.85750752674423,
"learning_rate": 6.836925575899777e-07,
"logits/chosen": -5.458807468414307,
"logits/rejected": -5.102845668792725,
"logps/chosen": -1.3218635320663452,
"logps/rejected": -1.4797770977020264,
"loss": 3.6527,
"rewards/accuracies": 0.75,
"rewards/chosen": -13.218635559082031,
"rewards/margins": 1.5791367292404175,
"rewards/rejected": -14.797771453857422,
"step": 94
},
{
"epoch": 0.3271631510977185,
"grad_norm": 91.67035563126446,
"learning_rate": 6.802778408293369e-07,
"logits/chosen": -6.600034713745117,
"logits/rejected": -5.972718238830566,
"logps/chosen": -1.1001228094100952,
"logps/rejected": -1.3483161926269531,
"loss": 3.2161,
"rewards/accuracies": 0.8125,
"rewards/chosen": -11.001227378845215,
"rewards/margins": 2.4819343090057373,
"rewards/rejected": -13.483161926269531,
"step": 95
},
{
"epoch": 0.33060697374085235,
"grad_norm": 82.93501550536419,
"learning_rate": 6.768225169429477e-07,
"logits/chosen": -5.710722923278809,
"logits/rejected": -5.354726314544678,
"logps/chosen": -1.1976306438446045,
"logps/rejected": -1.6553398370742798,
"loss": 3.474,
"rewards/accuracies": 0.75,
"rewards/chosen": -11.976305961608887,
"rewards/margins": 4.577092170715332,
"rewards/rejected": -16.55339813232422,
"step": 96
},
{
"epoch": 0.3340507963839862,
"grad_norm": 83.33902780022994,
"learning_rate": 6.733270865439557e-07,
"logits/chosen": -6.448300361633301,
"logits/rejected": -6.226202487945557,
"logps/chosen": -1.595934510231018,
"logps/rejected": -1.6069419384002686,
"loss": 3.8929,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.959344863891602,
"rewards/margins": 0.11007285118103027,
"rewards/rejected": -16.069419860839844,
"step": 97
},
{
"epoch": 0.3374946190271201,
"grad_norm": 117.67802338372276,
"learning_rate": 6.697920560562055e-07,
"logits/chosen": -6.556612968444824,
"logits/rejected": -6.181111812591553,
"logps/chosen": -1.4487080574035645,
"logps/rejected": -1.8549811840057373,
"loss": 3.6244,
"rewards/accuracies": 0.6875,
"rewards/chosen": -14.487081527709961,
"rewards/margins": 4.062728404998779,
"rewards/rejected": -18.549808502197266,
"step": 98
},
{
"epoch": 0.340938441670254,
"grad_norm": 93.0037276638684,
"learning_rate": 6.662179376408698e-07,
"logits/chosen": -7.180575370788574,
"logits/rejected": -6.442221641540527,
"logps/chosen": -1.184888243675232,
"logps/rejected": -1.4313077926635742,
"loss": 2.8886,
"rewards/accuracies": 0.5625,
"rewards/chosen": -11.848883628845215,
"rewards/margins": 2.4641964435577393,
"rewards/rejected": -14.313077926635742,
"step": 99
},
{
"epoch": 0.34438226431338786,
"grad_norm": 103.70431976296841,
"learning_rate": 6.626052491222453e-07,
"logits/chosen": -7.366156101226807,
"logits/rejected": -6.646521091461182,
"logps/chosen": -1.50858736038208,
"logps/rejected": -1.6617248058319092,
"loss": 3.8069,
"rewards/accuracies": 0.6875,
"rewards/chosen": -15.0858736038208,
"rewards/margins": 1.5313715934753418,
"rewards/rejected": -16.617244720458984,
"step": 100
},
{
"epoch": 0.34782608695652173,
"grad_norm": 97.56464703755402,
"learning_rate": 6.589545139127311e-07,
"logits/chosen": -6.810091972351074,
"logits/rejected": -6.6775031089782715,
"logps/chosen": -1.1999897956848145,
"logps/rejected": -1.4419368505477905,
"loss": 3.0443,
"rewards/accuracies": 0.625,
"rewards/chosen": -11.999898910522461,
"rewards/margins": 2.4194700717926025,
"rewards/rejected": -14.419368743896484,
"step": 101
},
{
"epoch": 0.3512699095996556,
"grad_norm": 109.40818246650933,
"learning_rate": 6.552662609369942e-07,
"logits/chosen": -9.70158576965332,
"logits/rejected": -9.41241455078125,
"logps/chosen": -1.6020368337631226,
"logps/rejected": -1.8466899394989014,
"loss": 3.8531,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.020368576049805,
"rewards/margins": 2.446530818939209,
"rewards/rejected": -18.466899871826172,
"step": 102
},
{
"epoch": 0.3547137322427895,
"grad_norm": 120.68703068267112,
"learning_rate": 6.515410245553393e-07,
"logits/chosen": -9.626636505126953,
"logits/rejected": -8.800621032714844,
"logps/chosen": -1.5177563428878784,
"logps/rejected": -2.0786399841308594,
"loss": 3.3957,
"rewards/accuracies": 0.875,
"rewards/chosen": -15.177563667297363,
"rewards/margins": 5.608834266662598,
"rewards/rejected": -20.78639793395996,
"step": 103
},
{
"epoch": 0.3581575548859234,
"grad_norm": 115.8693954281069,
"learning_rate": 6.477793444862892e-07,
"logits/chosen": -8.715924263000488,
"logits/rejected": -8.530646324157715,
"logps/chosen": -1.4800488948822021,
"logps/rejected": -1.7589524984359741,
"loss": 3.1903,
"rewards/accuracies": 0.75,
"rewards/chosen": -14.800487518310547,
"rewards/margins": 2.7890357971191406,
"rewards/rejected": -17.589523315429688,
"step": 104
},
{
"epoch": 0.36160137752905724,
"grad_norm": 152.94221871551687,
"learning_rate": 6.439817657283891e-07,
"logits/chosen": -9.968289375305176,
"logits/rejected": -9.650674819946289,
"logps/chosen": -1.2602545022964478,
"logps/rejected": -1.6873594522476196,
"loss": 3.4884,
"rewards/accuracies": 0.75,
"rewards/chosen": -12.602544784545898,
"rewards/margins": 4.271048545837402,
"rewards/rejected": -16.873594284057617,
"step": 105
},
{
"epoch": 0.3650452001721911,
"grad_norm": 105.58797890414657,
"learning_rate": 6.401488384812473e-07,
"logits/chosen": -9.584343910217285,
"logits/rejected": -9.589265823364258,
"logps/chosen": -1.5995938777923584,
"logps/rejected": -1.774956226348877,
"loss": 3.6783,
"rewards/accuracies": 0.75,
"rewards/chosen": -15.995938301086426,
"rewards/margins": 1.7536234855651855,
"rewards/rejected": -17.749563217163086,
"step": 106
},
{
"epoch": 0.36848902281532503,
"grad_norm": 136.93293579796645,
"learning_rate": 6.362811180658203e-07,
"logits/chosen": -10.062201499938965,
"logits/rejected": -9.910536766052246,
"logps/chosen": -1.5935949087142944,
"logps/rejected": -1.9505418539047241,
"loss": 3.243,
"rewards/accuracies": 0.75,
"rewards/chosen": -15.93595027923584,
"rewards/margins": 3.5694689750671387,
"rewards/rejected": -19.505420684814453,
"step": 107
},
{
"epoch": 0.3719328454584589,
"grad_norm": 116.29861634629408,
"learning_rate": 6.323791648439579e-07,
"logits/chosen": -9.214845657348633,
"logits/rejected": -8.844350814819336,
"logps/chosen": -1.5258371829986572,
"logps/rejected": -1.9718682765960693,
"loss": 3.2289,
"rewards/accuracies": 0.8125,
"rewards/chosen": -15.258371353149414,
"rewards/margins": 4.460310935974121,
"rewards/rejected": -19.71868324279785,
"step": 108
},
{
"epoch": 0.37537666810159276,
"grad_norm": 155.48156040186888,
"learning_rate": 6.284435441372161e-07,
"logits/chosen": -11.504440307617188,
"logits/rejected": -10.832094192504883,
"logps/chosen": -1.9326715469360352,
"logps/rejected": -2.6018829345703125,
"loss": 3.0293,
"rewards/accuracies": 0.75,
"rewards/chosen": -19.32671546936035,
"rewards/margins": 6.692113876342773,
"rewards/rejected": -26.018831253051758,
"step": 109
},
{
"epoch": 0.3788204907447266,
"grad_norm": 136.87645547171363,
"learning_rate": 6.244748261449529e-07,
"logits/chosen": -11.773118019104004,
"logits/rejected": -11.45300006866455,
"logps/chosen": -1.7673592567443848,
"logps/rejected": -2.0853307247161865,
"loss": 2.9492,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.67359161376953,
"rewards/margins": 3.179716110229492,
"rewards/rejected": -20.853307723999023,
"step": 110
},
{
"epoch": 0.38226431338786054,
"grad_norm": 118.1199403678312,
"learning_rate": 6.204735858617171e-07,
"logits/chosen": -11.518077850341797,
"logits/rejected": -10.808693885803223,
"logps/chosen": -1.9174708127975464,
"logps/rejected": -2.1425564289093018,
"loss": 3.0723,
"rewards/accuracies": 0.8125,
"rewards/chosen": -19.174705505371094,
"rewards/margins": 2.250857353210449,
"rewards/rejected": -21.425565719604492,
"step": 111
},
{
"epoch": 0.3857081360309944,
"grad_norm": 119.80139407969132,
"learning_rate": 6.164404029939416e-07,
"logits/chosen": -11.800997734069824,
"logits/rejected": -11.651061058044434,
"logps/chosen": -1.7692383527755737,
"logps/rejected": -2.131488800048828,
"loss": 3.1498,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.692384719848633,
"rewards/margins": 3.6225037574768066,
"rewards/rejected": -21.31488800048828,
"step": 112
},
{
"epoch": 0.3891519586741283,
"grad_norm": 163.89733948610942,
"learning_rate": 6.123758618759547e-07,
"logits/chosen": -11.592788696289062,
"logits/rejected": -11.94422435760498,
"logps/chosen": -2.020418405532837,
"logps/rejected": -2.4360499382019043,
"loss": 3.0169,
"rewards/accuracies": 0.75,
"rewards/chosen": -20.204185485839844,
"rewards/margins": 4.156314849853516,
"rewards/rejected": -24.36050033569336,
"step": 113
},
{
"epoch": 0.39259578131726214,
"grad_norm": 143.8629447599346,
"learning_rate": 6.082805513853209e-07,
"logits/chosen": -12.300226211547852,
"logits/rejected": -11.13952350616455,
"logps/chosen": -1.6513843536376953,
"logps/rejected": -2.2150726318359375,
"loss": 2.7393,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.513843536376953,
"rewards/margins": 5.636881351470947,
"rewards/rejected": -22.150726318359375,
"step": 114
},
{
"epoch": 0.39603960396039606,
"grad_norm": 173.63128250872595,
"learning_rate": 6.041550648575234e-07,
"logits/chosen": -11.796028137207031,
"logits/rejected": -11.548486709594727,
"logps/chosen": -2.350860595703125,
"logps/rejected": -2.701869010925293,
"loss": 3.1346,
"rewards/accuracies": 0.625,
"rewards/chosen": -23.50860595703125,
"rewards/margins": 3.510082721710205,
"rewards/rejected": -27.018688201904297,
"step": 115
},
{
"epoch": 0.3994834266035299,
"grad_norm": 138.3119351547436,
"learning_rate": 6e-07,
"logits/chosen": -12.655649185180664,
"logits/rejected": -12.318216323852539,
"logps/chosen": -1.7546292543411255,
"logps/rejected": -2.545380115509033,
"loss": 3.5043,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.54629135131836,
"rewards/margins": 7.907507419586182,
"rewards/rejected": -25.453800201416016,
"step": 116
},
{
"epoch": 0.4029272492466638,
"grad_norm": 145.98056778857588,
"learning_rate": 5.958159588055472e-07,
"logits/chosen": -13.69811725616455,
"logits/rejected": -13.62729549407959,
"logps/chosen": -1.6991206407546997,
"logps/rejected": -2.0454368591308594,
"loss": 3.1443,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.9912052154541,
"rewards/margins": 3.4631614685058594,
"rewards/rejected": -20.454364776611328,
"step": 117
},
{
"epoch": 0.40637107188979765,
"grad_norm": 169.50788721509392,
"learning_rate": 5.916035474651021e-07,
"logits/chosen": -13.291184425354004,
"logits/rejected": -13.162979125976562,
"logps/chosen": -1.7755848169326782,
"logps/rejected": -2.445830821990967,
"loss": 3.0084,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.755847930908203,
"rewards/margins": 6.702462196350098,
"rewards/rejected": -24.458311080932617,
"step": 118
},
{
"epoch": 0.4098148945329316,
"grad_norm": 119.7129501335904,
"learning_rate": 5.87363376279916e-07,
"logits/chosen": -12.720142364501953,
"logits/rejected": -12.485799789428711,
"logps/chosen": -1.9099136590957642,
"logps/rejected": -2.9828622341156006,
"loss": 2.6101,
"rewards/accuracies": 0.8125,
"rewards/chosen": -19.099138259887695,
"rewards/margins": 10.729487419128418,
"rewards/rejected": -29.828622817993164,
"step": 119
},
{
"epoch": 0.41325871717606544,
"grad_norm": 175.87119643186315,
"learning_rate": 5.830960595731334e-07,
"logits/chosen": -11.896202087402344,
"logits/rejected": -12.151188850402832,
"logps/chosen": -1.8780018091201782,
"logps/rejected": -2.5524849891662598,
"loss": 2.5416,
"rewards/accuracies": 0.75,
"rewards/chosen": -18.780019760131836,
"rewards/margins": 6.744830131530762,
"rewards/rejected": -25.52484893798828,
"step": 120
},
{
"epoch": 0.4167025398191993,
"grad_norm": 182.67604950430695,
"learning_rate": 5.788022156007876e-07,
"logits/chosen": -13.834617614746094,
"logits/rejected": -13.90433406829834,
"logps/chosen": -2.329464912414551,
"logps/rejected": -2.853301763534546,
"loss": 3.5812,
"rewards/accuracies": 0.75,
"rewards/chosen": -23.294649124145508,
"rewards/margins": 5.238368034362793,
"rewards/rejected": -28.533016204833984,
"step": 121
},
{
"epoch": 0.42014636246233317,
"grad_norm": 162.94046075177278,
"learning_rate": 5.744824664622269e-07,
"logits/chosen": -13.56065559387207,
"logits/rejected": -13.065262794494629,
"logps/chosen": -2.3431150913238525,
"logps/rejected": -2.9342470169067383,
"loss": 2.9716,
"rewards/accuracies": 0.8125,
"rewards/chosen": -23.431150436401367,
"rewards/margins": 5.911318778991699,
"rewards/rejected": -29.342470169067383,
"step": 122
},
{
"epoch": 0.4235901851054671,
"grad_norm": 140.20048424042622,
"learning_rate": 5.70137438009984e-07,
"logits/chosen": -14.668344497680664,
"logits/rejected": -13.541962623596191,
"logps/chosen": -2.4308154582977295,
"logps/rejected": -2.9639768600463867,
"loss": 3.1536,
"rewards/accuracies": 0.6875,
"rewards/chosen": -24.308155059814453,
"rewards/margins": 5.331615447998047,
"rewards/rejected": -29.639768600463867,
"step": 123
},
{
"epoch": 0.42703400774860095,
"grad_norm": 271.24268423314203,
"learning_rate": 5.657677597591007e-07,
"logits/chosen": -14.41106128692627,
"logits/rejected": -14.61843204498291,
"logps/chosen": -2.38899564743042,
"logps/rejected": -2.7550418376922607,
"loss": 3.5358,
"rewards/accuracies": 0.625,
"rewards/chosen": -23.889955520629883,
"rewards/margins": 3.660465717315674,
"rewards/rejected": -27.550418853759766,
"step": 124
},
{
"epoch": 0.4304778303917348,
"grad_norm": 165.17472401407244,
"learning_rate": 5.613740647959235e-07,
"logits/chosen": -12.676807403564453,
"logits/rejected": -12.471404075622559,
"logps/chosen": -1.8505709171295166,
"logps/rejected": -2.3470511436462402,
"loss": 2.8249,
"rewards/accuracies": 0.875,
"rewards/chosen": -18.505708694458008,
"rewards/margins": 4.964802265167236,
"rewards/rejected": -23.47051239013672,
"step": 125
},
{
"epoch": 0.4339216530348687,
"grad_norm": 143.69505743500713,
"learning_rate": 5.569569896863801e-07,
"logits/chosen": -13.985774993896484,
"logits/rejected": -13.654581069946289,
"logps/chosen": -1.782606601715088,
"logps/rejected": -2.051421642303467,
"loss": 3.256,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.826065063476562,
"rewards/margins": 2.688152551651001,
"rewards/rejected": -20.514219284057617,
"step": 126
},
{
"epoch": 0.4373654756780026,
"grad_norm": 198.98779014050083,
"learning_rate": 5.52517174383754e-07,
"logits/chosen": -13.829938888549805,
"logits/rejected": -13.86230182647705,
"logps/chosen": -2.4392776489257812,
"logps/rejected": -3.190002679824829,
"loss": 2.8292,
"rewards/accuracies": 0.8125,
"rewards/chosen": -24.39277458190918,
"rewards/margins": 7.50724983215332,
"rewards/rejected": -31.900026321411133,
"step": 127
},
{
"epoch": 0.44080929832113647,
"grad_norm": 189.97902844787896,
"learning_rate": 5.480552621359659e-07,
"logits/chosen": -14.226242065429688,
"logits/rejected": -14.341365814208984,
"logps/chosen": -2.10856294631958,
"logps/rejected": -2.4918906688690186,
"loss": 3.2012,
"rewards/accuracies": 0.625,
"rewards/chosen": -21.085630416870117,
"rewards/margins": 3.8332767486572266,
"rewards/rejected": -24.918907165527344,
"step": 128
},
{
"epoch": 0.44425312096427033,
"grad_norm": 136.19908022128857,
"learning_rate": 5.435718993923784e-07,
"logits/chosen": -13.451090812683105,
"logits/rejected": -12.785599708557129,
"logps/chosen": -1.6525287628173828,
"logps/rejected": -2.3770031929016113,
"loss": 2.4949,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.52528953552246,
"rewards/margins": 7.244744300842285,
"rewards/rejected": -23.77003288269043,
"step": 129
},
{
"epoch": 0.4476969436074042,
"grad_norm": 135.82081198678648,
"learning_rate": 5.39067735710139e-07,
"logits/chosen": -14.511775016784668,
"logits/rejected": -13.7813138961792,
"logps/chosen": -2.127079963684082,
"logps/rejected": -2.742191791534424,
"loss": 2.8888,
"rewards/accuracies": 0.5625,
"rewards/chosen": -21.27079963684082,
"rewards/margins": 6.151117324829102,
"rewards/rejected": -27.42191505432129,
"step": 130
},
{
"epoch": 0.4511407662505381,
"grad_norm": 183.19436612857393,
"learning_rate": 5.3454342366007e-07,
"logits/chosen": -14.230147361755371,
"logits/rejected": -13.983600616455078,
"logps/chosen": -2.1134510040283203,
"logps/rejected": -2.4505257606506348,
"loss": 3.9961,
"rewards/accuracies": 0.75,
"rewards/chosen": -21.134510040283203,
"rewards/margins": 3.3707499504089355,
"rewards/rejected": -24.505260467529297,
"step": 131
},
{
"epoch": 0.454584588893672,
"grad_norm": 207.54586277796528,
"learning_rate": 5.299996187321231e-07,
"logits/chosen": -15.675312042236328,
"logits/rejected": -15.437052726745605,
"logps/chosen": -1.9110677242279053,
"logps/rejected": -2.109048366546631,
"loss": 3.5684,
"rewards/accuracies": 0.5625,
"rewards/chosen": -19.110675811767578,
"rewards/margins": 1.9798049926757812,
"rewards/rejected": -21.090482711791992,
"step": 132
},
{
"epoch": 0.45802841153680585,
"grad_norm": 126.18602958529125,
"learning_rate": 5.254369792404108e-07,
"logits/chosen": -14.874656677246094,
"logits/rejected": -14.413407325744629,
"logps/chosen": -2.353257417678833,
"logps/rejected": -3.3936214447021484,
"loss": 2.0221,
"rewards/accuracies": 1.0,
"rewards/chosen": -23.532573699951172,
"rewards/margins": 10.403639793395996,
"rewards/rejected": -33.936214447021484,
"step": 133
},
{
"epoch": 0.4614722341799397,
"grad_norm": 143.02615992144584,
"learning_rate": 5.20856166227829e-07,
"logits/chosen": -15.980953216552734,
"logits/rejected": -15.589332580566406,
"logps/chosen": -2.683424472808838,
"logps/rejected": -3.0904102325439453,
"loss": 2.7952,
"rewards/accuracies": 0.75,
"rewards/chosen": -26.834243774414062,
"rewards/margins": 4.069858551025391,
"rewards/rejected": -30.904102325439453,
"step": 134
},
{
"epoch": 0.46491605682307363,
"grad_norm": 160.67163719835557,
"learning_rate": 5.162578433702844e-07,
"logits/chosen": -15.208805084228516,
"logits/rejected": -15.462963104248047,
"logps/chosen": -1.8320481777191162,
"logps/rejected": -2.23215651512146,
"loss": 2.9135,
"rewards/accuracies": 0.75,
"rewards/chosen": -18.320480346679688,
"rewards/margins": 4.001082420349121,
"rewards/rejected": -22.321565628051758,
"step": 135
},
{
"epoch": 0.4683598794662075,
"grad_norm": 145.99038790062517,
"learning_rate": 5.116426768805387e-07,
"logits/chosen": -14.624232292175293,
"logits/rejected": -14.728387832641602,
"logps/chosen": -2.1977291107177734,
"logps/rejected": -2.507412910461426,
"loss": 3.0741,
"rewards/accuracies": 0.8125,
"rewards/chosen": -21.977291107177734,
"rewards/margins": 3.096836566925049,
"rewards/rejected": -25.07413101196289,
"step": 136
},
{
"epoch": 0.47180370210934136,
"grad_norm": 148.12763051461735,
"learning_rate": 5.070113354116884e-07,
"logits/chosen": -15.4700927734375,
"logits/rejected": -15.196715354919434,
"logps/chosen": -1.5759204626083374,
"logps/rejected": -2.449571132659912,
"loss": 2.46,
"rewards/accuracies": 0.9375,
"rewards/chosen": -15.759203910827637,
"rewards/margins": 8.736505508422852,
"rewards/rejected": -24.495710372924805,
"step": 137
},
{
"epoch": 0.4752475247524752,
"grad_norm": 170.6076112934297,
"learning_rate": 5.023644899602871e-07,
"logits/chosen": -15.85372257232666,
"logits/rejected": -15.770110130310059,
"logps/chosen": -2.3490283489227295,
"logps/rejected": -2.6556615829467773,
"loss": 2.5731,
"rewards/accuracies": 0.625,
"rewards/chosen": -23.490280151367188,
"rewards/margins": 3.066333293914795,
"rewards/rejected": -26.556615829467773,
"step": 138
},
{
"epoch": 0.47869134739560915,
"grad_norm": 160.9701483874417,
"learning_rate": 4.977028137691324e-07,
"logits/chosen": -14.690975189208984,
"logits/rejected": -14.133597373962402,
"logps/chosen": -2.249864101409912,
"logps/rejected": -2.9582080841064453,
"loss": 2.6498,
"rewards/accuracies": 0.9375,
"rewards/chosen": -22.498640060424805,
"rewards/margins": 7.083439826965332,
"rewards/rejected": -29.582080841064453,
"step": 139
},
{
"epoch": 0.482135170038743,
"grad_norm": 140.63079154740615,
"learning_rate": 4.930269822297241e-07,
"logits/chosen": -15.633464813232422,
"logits/rejected": -15.003985404968262,
"logps/chosen": -1.8792824745178223,
"logps/rejected": -2.4007444381713867,
"loss": 3.0883,
"rewards/accuracies": 0.8125,
"rewards/chosen": -18.79282569885254,
"rewards/margins": 5.214618682861328,
"rewards/rejected": -24.0074462890625,
"step": 140
},
{
"epoch": 0.4855789926818769,
"grad_norm": 164.90245104113825,
"learning_rate": 4.883376727844129e-07,
"logits/chosen": -17.06644058227539,
"logits/rejected": -16.71479034423828,
"logps/chosen": -1.984204888343811,
"logps/rejected": -2.5203452110290527,
"loss": 3.2745,
"rewards/accuracies": 0.6875,
"rewards/chosen": -19.842050552368164,
"rewards/margins": 5.361400127410889,
"rewards/rejected": -25.203449249267578,
"step": 141
},
{
"epoch": 0.48902281532501074,
"grad_norm": 158.4727771377469,
"learning_rate": 4.836355648282509e-07,
"logits/chosen": -15.427898406982422,
"logits/rejected": -15.480656623840332,
"logps/chosen": -1.6626648902893066,
"logps/rejected": -2.4167871475219727,
"loss": 2.6222,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.62664794921875,
"rewards/margins": 7.54122257232666,
"rewards/rejected": -24.16787338256836,
"step": 142
},
{
"epoch": 0.49246663796814466,
"grad_norm": 164.75096025896175,
"learning_rate": 4.7892133961056e-07,
"logits/chosen": -17.340797424316406,
"logits/rejected": -16.83738899230957,
"logps/chosen": -2.716911554336548,
"logps/rejected": -3.8089609146118164,
"loss": 3.2104,
"rewards/accuracies": 0.9375,
"rewards/chosen": -27.16911506652832,
"rewards/margins": 10.920495986938477,
"rewards/rejected": -38.08961486816406,
"step": 143
},
{
"epoch": 0.4959104606112785,
"grad_norm": 165.51865689779507,
"learning_rate": 4.7419568013623185e-07,
"logits/chosen": -17.844758987426758,
"logits/rejected": -17.37071418762207,
"logps/chosen": -2.3916749954223633,
"logps/rejected": -2.984015703201294,
"loss": 3.5064,
"rewards/accuracies": 0.75,
"rewards/chosen": -23.916751861572266,
"rewards/margins": 5.923404693603516,
"rewards/rejected": -29.84015655517578,
"step": 144
},
{
"epoch": 0.4993542832544124,
"grad_norm": 170.81988641182852,
"learning_rate": 4.694592710667722e-07,
"logits/chosen": -16.56422996520996,
"logits/rejected": -16.64710235595703,
"logps/chosen": -1.9375088214874268,
"logps/rejected": -2.7873284816741943,
"loss": 2.599,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.375089645385742,
"rewards/margins": 8.498197555541992,
"rewards/rejected": -27.873287200927734,
"step": 145
},
{
"epoch": 0.5027981058975463,
"grad_norm": 152.83376641387204,
"learning_rate": 4.6471279862110594e-07,
"logits/chosen": -16.366130828857422,
"logits/rejected": -16.218612670898438,
"logps/chosen": -2.085256814956665,
"logps/rejected": -2.532078742980957,
"loss": 2.5765,
"rewards/accuracies": 0.75,
"rewards/chosen": -20.852569580078125,
"rewards/margins": 4.468219757080078,
"rewards/rejected": -25.320789337158203,
"step": 146
},
{
"epoch": 0.5062419285406802,
"grad_norm": 167.03745018894716,
"learning_rate": 4.5995695047615724e-07,
"logits/chosen": -16.575876235961914,
"logits/rejected": -16.29088020324707,
"logps/chosen": -1.7308956384658813,
"logps/rejected": -2.0768227577209473,
"loss": 3.2235,
"rewards/accuracies": 0.75,
"rewards/chosen": -17.308956146240234,
"rewards/margins": 3.459270477294922,
"rewards/rejected": -20.768226623535156,
"step": 147
},
{
"epoch": 0.509685751183814,
"grad_norm": 178.30073567921698,
"learning_rate": 4.5519241566721724e-07,
"logits/chosen": -15.774458885192871,
"logits/rejected": -15.673702239990234,
"logps/chosen": -2.2769925594329834,
"logps/rejected": -2.522907257080078,
"loss": 3.6175,
"rewards/accuracies": 0.75,
"rewards/chosen": -22.769929885864258,
"rewards/margins": 2.45914363861084,
"rewards/rejected": -25.229076385498047,
"step": 148
},
{
"epoch": 0.5131295738269479,
"grad_norm": 137.31263512738002,
"learning_rate": 4.5041988448811574e-07,
"logits/chosen": -15.081258773803711,
"logits/rejected": -15.273090362548828,
"logps/chosen": -1.8598787784576416,
"logps/rejected": -2.1195287704467773,
"loss": 2.687,
"rewards/accuracies": 0.6875,
"rewards/chosen": -18.598787307739258,
"rewards/margins": 2.596500873565674,
"rewards/rejected": -21.195289611816406,
"step": 149
},
{
"epoch": 0.5165733964700818,
"grad_norm": 125.93526672957817,
"learning_rate": 4.456400483912099e-07,
"logits/chosen": -16.464996337890625,
"logits/rejected": -16.59218978881836,
"logps/chosen": -2.1750199794769287,
"logps/rejected": -2.6353604793548584,
"loss": 2.8572,
"rewards/accuracies": 0.875,
"rewards/chosen": -21.750200271606445,
"rewards/margins": 4.603403568267822,
"rewards/rejected": -26.35360336303711,
"step": 150
},
{
"epoch": 0.5200172191132156,
"grad_norm": 142.20133174481876,
"learning_rate": 4.4085359988720583e-07,
"logits/chosen": -15.427270889282227,
"logits/rejected": -15.429370880126953,
"logps/chosen": -1.9908243417739868,
"logps/rejected": -2.447384834289551,
"loss": 2.1271,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.908245086669922,
"rewards/margins": 4.565605163574219,
"rewards/rejected": -24.473848342895508,
"step": 151
},
{
"epoch": 0.5234610417563496,
"grad_norm": 149.43131783531857,
"learning_rate": 4.3606123244482615e-07,
"logits/chosen": -16.817100524902344,
"logits/rejected": -16.41914176940918,
"logps/chosen": -2.2302825450897217,
"logps/rejected": -3.1961381435394287,
"loss": 2.7684,
"rewards/accuracies": 0.8125,
"rewards/chosen": -22.302825927734375,
"rewards/margins": 9.658554077148438,
"rewards/rejected": -31.961380004882812,
"step": 152
},
{
"epoch": 0.5269048643994835,
"grad_norm": 175.08007654534825,
"learning_rate": 4.3126364039033934e-07,
"logits/chosen": -16.285236358642578,
"logits/rejected": -16.360095977783203,
"logps/chosen": -1.9334690570831299,
"logps/rejected": -2.5686261653900146,
"loss": 2.9525,
"rewards/accuracies": 0.6875,
"rewards/chosen": -19.33469009399414,
"rewards/margins": 6.3515706062316895,
"rewards/rejected": -25.686262130737305,
"step": 153
},
{
"epoch": 0.5303486870426173,
"grad_norm": 147.77300608746174,
"learning_rate": 4.2646151880696466e-07,
"logits/chosen": -15.203396797180176,
"logits/rejected": -15.251398086547852,
"logps/chosen": -2.082613945007324,
"logps/rejected": -2.380194664001465,
"loss": 3.1857,
"rewards/accuracies": 0.8125,
"rewards/chosen": -20.826141357421875,
"rewards/margins": 2.9758081436157227,
"rewards/rejected": -23.80194854736328,
"step": 154
},
{
"epoch": 0.5337925096857512,
"grad_norm": 191.413155487678,
"learning_rate": 4.21655563434167e-07,
"logits/chosen": -16.2647762298584,
"logits/rejected": -16.06024742126465,
"logps/chosen": -1.8659639358520508,
"logps/rejected": -2.6368792057037354,
"loss": 2.8876,
"rewards/accuracies": 0.875,
"rewards/chosen": -18.659639358520508,
"rewards/margins": 7.7091522216796875,
"rewards/rejected": -26.368793487548828,
"step": 155
},
{
"epoch": 0.537236332328885,
"grad_norm": 124.88412207047179,
"learning_rate": 4.16846470566857e-07,
"logits/chosen": -16.624813079833984,
"logits/rejected": -16.500511169433594,
"logps/chosen": -1.741891622543335,
"logps/rejected": -2.405266284942627,
"loss": 1.9418,
"rewards/accuracies": 0.8125,
"rewards/chosen": -17.418916702270508,
"rewards/margins": 6.633745193481445,
"rewards/rejected": -24.052661895751953,
"step": 156
},
{
"epoch": 0.5406801549720189,
"grad_norm": 141.77331157520445,
"learning_rate": 4.120349369545109e-07,
"logits/chosen": -15.149438858032227,
"logits/rejected": -15.287237167358398,
"logps/chosen": -2.173785448074341,
"logps/rejected": -3.180941104888916,
"loss": 2.7363,
"rewards/accuracies": 0.625,
"rewards/chosen": -21.73785400390625,
"rewards/margins": 10.071558952331543,
"rewards/rejected": -31.809410095214844,
"step": 157
},
{
"epoch": 0.5441239776151529,
"grad_norm": 155.44419089941894,
"learning_rate": 4.0722165970022414e-07,
"logits/chosen": -16.01889419555664,
"logits/rejected": -16.09550666809082,
"logps/chosen": -2.3958230018615723,
"logps/rejected": -2.5509209632873535,
"loss": 3.3508,
"rewards/accuracies": 0.5,
"rewards/chosen": -23.95823097229004,
"rewards/margins": 1.5509822368621826,
"rewards/rejected": -25.509214401245117,
"step": 158
},
{
"epoch": 0.5475678002582867,
"grad_norm": 129.29743296707403,
"learning_rate": 4.024073361597142e-07,
"logits/chosen": -17.30500030517578,
"logits/rejected": -16.847618103027344,
"logps/chosen": -2.4113364219665527,
"logps/rejected": -3.3326172828674316,
"loss": 2.5569,
"rewards/accuracies": 0.8125,
"rewards/chosen": -24.113361358642578,
"rewards/margins": 9.212811470031738,
"rewards/rejected": -33.326175689697266,
"step": 159
},
{
"epoch": 0.5510116229014206,
"grad_norm": 161.70130038708717,
"learning_rate": 3.9759266384028583e-07,
"logits/chosen": -15.621679306030273,
"logits/rejected": -15.098061561584473,
"logps/chosen": -2.271921157836914,
"logps/rejected": -2.7090096473693848,
"loss": 2.7771,
"rewards/accuracies": 0.625,
"rewards/chosen": -22.71921157836914,
"rewards/margins": 4.370884895324707,
"rewards/rejected": -27.090097427368164,
"step": 160
},
{
"epoch": 0.5544554455445545,
"grad_norm": 157.66640514865557,
"learning_rate": 3.927783402997757e-07,
"logits/chosen": -15.658122062683105,
"logits/rejected": -15.553414344787598,
"logps/chosen": -2.2297635078430176,
"logps/rejected": -2.9377989768981934,
"loss": 2.6828,
"rewards/accuracies": 0.8125,
"rewards/chosen": -22.29763412475586,
"rewards/margins": 7.080355644226074,
"rewards/rejected": -29.37799072265625,
"step": 161
},
{
"epoch": 0.5578992681876883,
"grad_norm": 135.88851128774647,
"learning_rate": 3.879650630454892e-07,
"logits/chosen": -16.659839630126953,
"logits/rejected": -16.298494338989258,
"logps/chosen": -2.3507156372070312,
"logps/rejected": -2.968364715576172,
"loss": 2.8013,
"rewards/accuracies": 0.875,
"rewards/chosen": -23.50715446472168,
"rewards/margins": 6.176491737365723,
"rewards/rejected": -29.683645248413086,
"step": 162
},
{
"epoch": 0.5613430908308222,
"grad_norm": 113.40296909004411,
"learning_rate": 3.83153529433143e-07,
"logits/chosen": -14.723609924316406,
"logits/rejected": -14.707221984863281,
"logps/chosen": -2.070491313934326,
"logps/rejected": -2.8369667530059814,
"loss": 2.4633,
"rewards/accuracies": 0.8125,
"rewards/chosen": -20.704910278320312,
"rewards/margins": 7.664756774902344,
"rewards/rejected": -28.369670867919922,
"step": 163
},
{
"epoch": 0.564786913473956,
"grad_norm": 158.36069390792653,
"learning_rate": 3.78344436565833e-07,
"logits/chosen": -16.133251190185547,
"logits/rejected": -15.6480712890625,
"logps/chosen": -2.447453498840332,
"logps/rejected": -3.0984373092651367,
"loss": 2.7807,
"rewards/accuracies": 0.75,
"rewards/chosen": -24.47453498840332,
"rewards/margins": 6.50984001159668,
"rewards/rejected": -30.984373092651367,
"step": 164
},
{
"epoch": 0.56823073611709,
"grad_norm": 144.30140846289407,
"learning_rate": 3.7353848119303536e-07,
"logits/chosen": -14.615021705627441,
"logits/rejected": -14.4873685836792,
"logps/chosen": -2.1710031032562256,
"logps/rejected": -3.150233030319214,
"loss": 2.3923,
"rewards/accuracies": 0.9375,
"rewards/chosen": -21.710033416748047,
"rewards/margins": 9.79229736328125,
"rewards/rejected": -31.502328872680664,
"step": 165
},
{
"epoch": 0.5716745587602239,
"grad_norm": 156.39504517806822,
"learning_rate": 3.687363596096607e-07,
"logits/chosen": -13.180891036987305,
"logits/rejected": -13.603325843811035,
"logps/chosen": -2.1853692531585693,
"logps/rejected": -2.7947943210601807,
"loss": 2.5098,
"rewards/accuracies": 0.8125,
"rewards/chosen": -21.85369300842285,
"rewards/margins": 6.09425163269043,
"rewards/rejected": -27.94794464111328,
"step": 166
},
{
"epoch": 0.5751183814033577,
"grad_norm": 189.2555316232527,
"learning_rate": 3.639387675551739e-07,
"logits/chosen": -16.535764694213867,
"logits/rejected": -16.17355728149414,
"logps/chosen": -2.0116500854492188,
"logps/rejected": -2.5565733909606934,
"loss": 2.8736,
"rewards/accuracies": 0.625,
"rewards/chosen": -20.11650276184082,
"rewards/margins": 5.449231147766113,
"rewards/rejected": -25.56573486328125,
"step": 167
},
{
"epoch": 0.5785622040464916,
"grad_norm": 168.96863729883424,
"learning_rate": 3.5914640011279424e-07,
"logits/chosen": -17.7143497467041,
"logits/rejected": -17.613689422607422,
"logps/chosen": -2.375505208969116,
"logps/rejected": -3.2843146324157715,
"loss": 1.8044,
"rewards/accuracies": 0.9375,
"rewards/chosen": -23.75505256652832,
"rewards/margins": 9.088095664978027,
"rewards/rejected": -32.84314727783203,
"step": 168
},
{
"epoch": 0.5820060266896255,
"grad_norm": 154.96333991867238,
"learning_rate": 3.543599516087901e-07,
"logits/chosen": -16.464033126831055,
"logits/rejected": -16.16067886352539,
"logps/chosen": -2.4366371631622314,
"logps/rejected": -2.9191558361053467,
"loss": 2.8393,
"rewards/accuracies": 0.75,
"rewards/chosen": -24.366371154785156,
"rewards/margins": 4.825188636779785,
"rewards/rejected": -29.191558837890625,
"step": 169
},
{
"epoch": 0.5854498493327593,
"grad_norm": 133.1585711778442,
"learning_rate": 3.495801155118843e-07,
"logits/chosen": -17.312694549560547,
"logits/rejected": -17.00650978088379,
"logps/chosen": -2.205974817276001,
"logps/rejected": -2.8471574783325195,
"loss": 2.1623,
"rewards/accuracies": 1.0,
"rewards/chosen": -22.059749603271484,
"rewards/margins": 6.411825656890869,
"rewards/rejected": -28.471574783325195,
"step": 170
},
{
"epoch": 0.5888936719758933,
"grad_norm": 161.54792608348458,
"learning_rate": 3.448075843327827e-07,
"logits/chosen": -16.923572540283203,
"logits/rejected": -16.892681121826172,
"logps/chosen": -2.1601150035858154,
"logps/rejected": -2.8166677951812744,
"loss": 2.3029,
"rewards/accuracies": 0.75,
"rewards/chosen": -21.601150512695312,
"rewards/margins": 6.56552791595459,
"rewards/rejected": -28.16668128967285,
"step": 171
},
{
"epoch": 0.5923374946190271,
"grad_norm": 147.52184027924474,
"learning_rate": 3.4004304952384283e-07,
"logits/chosen": -17.819061279296875,
"logits/rejected": -17.36820411682129,
"logps/chosen": -2.7700634002685547,
"logps/rejected": -3.871890068054199,
"loss": 2.3046,
"rewards/accuracies": 0.9375,
"rewards/chosen": -27.700634002685547,
"rewards/margins": 11.018264770507812,
"rewards/rejected": -38.71889877319336,
"step": 172
},
{
"epoch": 0.595781317262161,
"grad_norm": 133.0174741810263,
"learning_rate": 3.352872013788941e-07,
"logits/chosen": -15.862306594848633,
"logits/rejected": -15.657508850097656,
"logps/chosen": -1.8171809911727905,
"logps/rejected": -2.790759325027466,
"loss": 1.9991,
"rewards/accuracies": 0.875,
"rewards/chosen": -18.171810150146484,
"rewards/margins": 9.735782623291016,
"rewards/rejected": -27.907590866088867,
"step": 173
},
{
"epoch": 0.5992251399052949,
"grad_norm": 140.03974045337986,
"learning_rate": 3.3054072893322785e-07,
"logits/chosen": -18.625810623168945,
"logits/rejected": -18.38481330871582,
"logps/chosen": -2.596587896347046,
"logps/rejected": -2.94968843460083,
"loss": 2.901,
"rewards/accuracies": 0.6875,
"rewards/chosen": -25.965877532958984,
"rewards/margins": 3.5310049057006836,
"rewards/rejected": -29.496883392333984,
"step": 174
},
{
"epoch": 0.6026689625484287,
"grad_norm": 183.98457603492201,
"learning_rate": 3.258043198637682e-07,
"logits/chosen": -14.411111831665039,
"logits/rejected": -14.562616348266602,
"logps/chosen": -2.5059261322021484,
"logps/rejected": -3.6475634574890137,
"loss": 2.2702,
"rewards/accuracies": 0.875,
"rewards/chosen": -25.059261322021484,
"rewards/margins": 11.41637134552002,
"rewards/rejected": -36.47563171386719,
"step": 175
},
{
"epoch": 0.6061127851915626,
"grad_norm": 213.58061636651394,
"learning_rate": 3.2107866038944004e-07,
"logits/chosen": -18.129159927368164,
"logits/rejected": -18.01100730895996,
"logps/chosen": -3.2717809677124023,
"logps/rejected": -3.870783805847168,
"loss": 2.6386,
"rewards/accuracies": 0.875,
"rewards/chosen": -32.71781539916992,
"rewards/margins": 5.990023612976074,
"rewards/rejected": -38.70783615112305,
"step": 176
},
{
"epoch": 0.6095566078346966,
"grad_norm": 179.1745395557081,
"learning_rate": 3.163644351717492e-07,
"logits/chosen": -17.755756378173828,
"logits/rejected": -17.911243438720703,
"logps/chosen": -2.3970510959625244,
"logps/rejected": -3.1361923217773438,
"loss": 2.4945,
"rewards/accuracies": 0.75,
"rewards/chosen": -23.970510482788086,
"rewards/margins": 7.391412734985352,
"rewards/rejected": -31.36192512512207,
"step": 177
},
{
"epoch": 0.6130004304778304,
"grad_norm": 167.61011422532053,
"learning_rate": 3.1166232721558714e-07,
"logits/chosen": -17.961496353149414,
"logits/rejected": -18.155773162841797,
"logps/chosen": -2.3098530769348145,
"logps/rejected": -3.018498182296753,
"loss": 2.8513,
"rewards/accuracies": 0.75,
"rewards/chosen": -23.098527908325195,
"rewards/margins": 7.086452484130859,
"rewards/rejected": -30.184980392456055,
"step": 178
},
{
"epoch": 0.6164442531209643,
"grad_norm": 166.57075495491398,
"learning_rate": 3.069730177702759e-07,
"logits/chosen": -16.301759719848633,
"logits/rejected": -16.399494171142578,
"logps/chosen": -1.7774579524993896,
"logps/rejected": -3.1513190269470215,
"loss": 2.6985,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.774578094482422,
"rewards/margins": 13.73861312866211,
"rewards/rejected": -31.51319122314453,
"step": 179
},
{
"epoch": 0.6198880757640981,
"grad_norm": 176.06415844976797,
"learning_rate": 3.022971862308676e-07,
"logits/chosen": -18.093582153320312,
"logits/rejected": -18.556367874145508,
"logps/chosen": -2.9991700649261475,
"logps/rejected": -3.220672845840454,
"loss": 3.4972,
"rewards/accuracies": 0.6875,
"rewards/chosen": -29.991701126098633,
"rewards/margins": 2.215024709701538,
"rewards/rejected": -32.20672607421875,
"step": 180
},
{
"epoch": 0.623331898407232,
"grad_norm": 165.59558701094664,
"learning_rate": 2.9763551003971285e-07,
"logits/chosen": -17.380640029907227,
"logits/rejected": -17.27768325805664,
"logps/chosen": -2.5434718132019043,
"logps/rejected": -3.3419389724731445,
"loss": 2.149,
"rewards/accuracies": 0.6875,
"rewards/chosen": -25.434715270996094,
"rewards/margins": 7.984673976898193,
"rewards/rejected": -33.41939163208008,
"step": 181
},
{
"epoch": 0.6267757210503659,
"grad_norm": 202.61030946129074,
"learning_rate": 2.929886645883117e-07,
"logits/chosen": -18.67276954650879,
"logits/rejected": -18.613061904907227,
"logps/chosen": -2.4041404724121094,
"logps/rejected": -3.169337749481201,
"loss": 3.206,
"rewards/accuracies": 0.875,
"rewards/chosen": -24.041404724121094,
"rewards/margins": 7.651971340179443,
"rewards/rejected": -31.693378448486328,
"step": 182
},
{
"epoch": 0.6302195436934998,
"grad_norm": 187.65675738898932,
"learning_rate": 2.883573231194613e-07,
"logits/chosen": -17.532236099243164,
"logits/rejected": -17.939956665039062,
"logps/chosen": -2.346766233444214,
"logps/rejected": -3.403451681137085,
"loss": 2.2531,
"rewards/accuracies": 0.875,
"rewards/chosen": -23.46766471862793,
"rewards/margins": 10.566852569580078,
"rewards/rejected": -34.034515380859375,
"step": 183
},
{
"epoch": 0.6336633663366337,
"grad_norm": 195.90592847716476,
"learning_rate": 2.837421566297156e-07,
"logits/chosen": -17.615713119506836,
"logits/rejected": -17.56720542907715,
"logps/chosen": -2.333320140838623,
"logps/rejected": -2.91764760017395,
"loss": 3.0132,
"rewards/accuracies": 0.625,
"rewards/chosen": -23.333202362060547,
"rewards/margins": 5.8432722091674805,
"rewards/rejected": -29.176475524902344,
"step": 184
},
{
"epoch": 0.6371071889797676,
"grad_norm": 183.34560091532057,
"learning_rate": 2.7914383377217083e-07,
"logits/chosen": -18.328622817993164,
"logits/rejected": -18.28988265991211,
"logps/chosen": -2.4505515098571777,
"logps/rejected": -3.1104087829589844,
"loss": 2.7728,
"rewards/accuracies": 0.6875,
"rewards/chosen": -24.50551414489746,
"rewards/margins": 6.598570823669434,
"rewards/rejected": -31.104084014892578,
"step": 185
},
{
"epoch": 0.6405510116229014,
"grad_norm": 143.6928758781207,
"learning_rate": 2.745630207595893e-07,
"logits/chosen": -17.590606689453125,
"logits/rejected": -17.842931747436523,
"logps/chosen": -2.3018007278442383,
"logps/rejected": -3.050802707672119,
"loss": 2.3707,
"rewards/accuracies": 0.875,
"rewards/chosen": -23.018009185791016,
"rewards/margins": 7.490016937255859,
"rewards/rejected": -30.508028030395508,
"step": 186
},
{
"epoch": 0.6439948342660353,
"grad_norm": 184.0549446666674,
"learning_rate": 2.70000381267877e-07,
"logits/chosen": -17.26044273376465,
"logits/rejected": -16.797651290893555,
"logps/chosen": -2.5893783569335938,
"logps/rejected": -3.3172554969787598,
"loss": 2.5419,
"rewards/accuracies": 0.8125,
"rewards/chosen": -25.893783569335938,
"rewards/margins": 7.278769016265869,
"rewards/rejected": -33.17255401611328,
"step": 187
},
{
"epoch": 0.6474386569091691,
"grad_norm": 162.70714561559674,
"learning_rate": 2.654565763399299e-07,
"logits/chosen": -17.615966796875,
"logits/rejected": -17.202425003051758,
"logps/chosen": -2.06203556060791,
"logps/rejected": -2.5958409309387207,
"loss": 2.3888,
"rewards/accuracies": 0.8125,
"rewards/chosen": -20.62035369873047,
"rewards/margins": 5.338054180145264,
"rewards/rejected": -25.958410263061523,
"step": 188
},
{
"epoch": 0.650882479552303,
"grad_norm": 211.4855934809745,
"learning_rate": 2.6093226428986103e-07,
"logits/chosen": -17.890888214111328,
"logits/rejected": -18.181949615478516,
"logps/chosen": -2.5792300701141357,
"logps/rejected": -2.9869372844696045,
"loss": 3.2394,
"rewards/accuracies": 0.625,
"rewards/chosen": -25.792301177978516,
"rewards/margins": 4.077073574066162,
"rewards/rejected": -29.869373321533203,
"step": 189
},
{
"epoch": 0.654326302195437,
"grad_norm": 241.71964487552518,
"learning_rate": 2.564281006076217e-07,
"logits/chosen": -17.281450271606445,
"logits/rejected": -17.177658081054688,
"logps/chosen": -2.0290184020996094,
"logps/rejected": -2.531289577484131,
"loss": 2.9749,
"rewards/accuracies": 0.625,
"rewards/chosen": -20.29018211364746,
"rewards/margins": 5.022716999053955,
"rewards/rejected": -25.31290054321289,
"step": 190
},
{
"epoch": 0.6577701248385708,
"grad_norm": 140.2272151723766,
"learning_rate": 2.519447378640342e-07,
"logits/chosen": -18.04941749572754,
"logits/rejected": -17.695268630981445,
"logps/chosen": -2.407273292541504,
"logps/rejected": -3.1553428173065186,
"loss": 2.5175,
"rewards/accuracies": 0.8125,
"rewards/chosen": -24.07273292541504,
"rewards/margins": 7.480693340301514,
"rewards/rejected": -31.553424835205078,
"step": 191
},
{
"epoch": 0.6612139474817047,
"grad_norm": 122.61240060144814,
"learning_rate": 2.4748282561624587e-07,
"logits/chosen": -18.940771102905273,
"logits/rejected": -19.209871292114258,
"logps/chosen": -2.9477639198303223,
"logps/rejected": -3.3782527446746826,
"loss": 2.231,
"rewards/accuracies": 0.8125,
"rewards/chosen": -29.47764015197754,
"rewards/margins": 4.304885387420654,
"rewards/rejected": -33.78252410888672,
"step": 192
},
{
"epoch": 0.6646577701248386,
"grad_norm": 154.6658255017087,
"learning_rate": 2.4304301031361993e-07,
"logits/chosen": -17.39430046081543,
"logits/rejected": -17.464357376098633,
"logps/chosen": -1.6274338960647583,
"logps/rejected": -2.39015531539917,
"loss": 2.0168,
"rewards/accuracies": 0.8125,
"rewards/chosen": -16.274337768554688,
"rewards/margins": 7.627217769622803,
"rewards/rejected": -23.901554107666016,
"step": 193
},
{
"epoch": 0.6681015927679724,
"grad_norm": 183.06073957842517,
"learning_rate": 2.386259352040766e-07,
"logits/chosen": -17.165082931518555,
"logits/rejected": -17.03466796875,
"logps/chosen": -2.3020198345184326,
"logps/rejected": -3.141963481903076,
"loss": 2.5949,
"rewards/accuracies": 0.9375,
"rewards/chosen": -23.020198822021484,
"rewards/margins": 8.399435997009277,
"rewards/rejected": -31.419633865356445,
"step": 194
},
{
"epoch": 0.6715454154111064,
"grad_norm": 195.7328576465893,
"learning_rate": 2.3423224024089924e-07,
"logits/chosen": -16.67756462097168,
"logits/rejected": -16.06248664855957,
"logps/chosen": -1.9835630655288696,
"logps/rejected": -2.165980815887451,
"loss": 2.8892,
"rewards/accuracies": 0.6875,
"rewards/chosen": -19.83563232421875,
"rewards/margins": 1.8241767883300781,
"rewards/rejected": -21.659809112548828,
"step": 195
},
{
"epoch": 0.6749892380542402,
"grad_norm": 185.40100121088884,
"learning_rate": 2.2986256199001607e-07,
"logits/chosen": -16.720062255859375,
"logits/rejected": -17.210546493530273,
"logps/chosen": -2.1794826984405518,
"logps/rejected": -3.011854887008667,
"loss": 3.2752,
"rewards/accuracies": 0.9375,
"rewards/chosen": -21.79482650756836,
"rewards/margins": 8.323722839355469,
"rewards/rejected": -30.118549346923828,
"step": 196
},
{
"epoch": 0.6784330606973741,
"grad_norm": 169.01081696151877,
"learning_rate": 2.2551753353777298e-07,
"logits/chosen": -16.483049392700195,
"logits/rejected": -15.803775787353516,
"logps/chosen": -1.923852562904358,
"logps/rejected": -2.4508516788482666,
"loss": 2.4423,
"rewards/accuracies": 0.8125,
"rewards/chosen": -19.238523483276367,
"rewards/margins": 5.269989967346191,
"rewards/rejected": -24.508514404296875,
"step": 197
},
{
"epoch": 0.681876883340508,
"grad_norm": 184.28493562958423,
"learning_rate": 2.2119778439921243e-07,
"logits/chosen": -17.207120895385742,
"logits/rejected": -17.228199005126953,
"logps/chosen": -2.1767778396606445,
"logps/rejected": -2.8498196601867676,
"loss": 2.6463,
"rewards/accuracies": 0.875,
"rewards/chosen": -21.767776489257812,
"rewards/margins": 6.730417251586914,
"rewards/rejected": -28.49819564819336,
"step": 198
},
{
"epoch": 0.6853207059836418,
"grad_norm": 185.34805729694065,
"learning_rate": 2.169039404268666e-07,
"logits/chosen": -15.101792335510254,
"logits/rejected": -15.015120506286621,
"logps/chosen": -1.9498858451843262,
"logps/rejected": -2.9745125770568848,
"loss": 2.4206,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.498859405517578,
"rewards/margins": 10.246267318725586,
"rewards/rejected": -29.745128631591797,
"step": 199
},
{
"epoch": 0.6887645286267757,
"grad_norm": 171.9250040897193,
"learning_rate": 2.1263662372008397e-07,
"logits/chosen": -17.20707893371582,
"logits/rejected": -17.392990112304688,
"logps/chosen": -2.209491491317749,
"logps/rejected": -3.219839334487915,
"loss": 2.3084,
"rewards/accuracies": 0.875,
"rewards/chosen": -22.094913482666016,
"rewards/margins": 10.103475570678711,
"rewards/rejected": -32.19839096069336,
"step": 200
},
{
"epoch": 0.6922083512699096,
"grad_norm": 134.1599441897618,
"learning_rate": 2.0839645253489785e-07,
"logits/chosen": -17.15797233581543,
"logits/rejected": -17.089859008789062,
"logps/chosen": -2.460911512374878,
"logps/rejected": -3.360152244567871,
"loss": 2.7262,
"rewards/accuracies": 0.875,
"rewards/chosen": -24.60911750793457,
"rewards/margins": 8.992408752441406,
"rewards/rejected": -33.601524353027344,
"step": 201
},
{
"epoch": 0.6956521739130435,
"grad_norm": 225.89869134193762,
"learning_rate": 2.0418404119445257e-07,
"logits/chosen": -18.76146697998047,
"logits/rejected": -18.73739242553711,
"logps/chosen": -2.1693127155303955,
"logps/rejected": -2.5993404388427734,
"loss": 2.7532,
"rewards/accuracies": 0.75,
"rewards/chosen": -21.693126678466797,
"rewards/margins": 4.3002777099609375,
"rewards/rejected": -25.993404388427734,
"step": 202
},
{
"epoch": 0.6990959965561774,
"grad_norm": 168.18364451173605,
"learning_rate": 2.0000000000000007e-07,
"logits/chosen": -16.125919342041016,
"logits/rejected": -16.450841903686523,
"logps/chosen": -2.2542104721069336,
"logps/rejected": -2.954272985458374,
"loss": 2.3868,
"rewards/accuracies": 0.8125,
"rewards/chosen": -22.542104721069336,
"rewards/margins": 7.000626087188721,
"rewards/rejected": -29.542734146118164,
"step": 203
},
{
"epoch": 0.7025398191993112,
"grad_norm": 169.8912048470285,
"learning_rate": 1.9584493514247673e-07,
"logits/chosen": -15.31773567199707,
"logits/rejected": -15.480231285095215,
"logps/chosen": -2.3769216537475586,
"logps/rejected": -3.0659618377685547,
"loss": 2.829,
"rewards/accuracies": 0.75,
"rewards/chosen": -23.769216537475586,
"rewards/margins": 6.890398979187012,
"rewards/rejected": -30.65961456298828,
"step": 204
},
{
"epoch": 0.7059836418424451,
"grad_norm": 164.25344840367887,
"learning_rate": 1.91719448614679e-07,
"logits/chosen": -18.408157348632812,
"logits/rejected": -18.19486427307129,
"logps/chosen": -2.329998016357422,
"logps/rejected": -2.998572587966919,
"loss": 2.6702,
"rewards/accuracies": 0.75,
"rewards/chosen": -23.29998016357422,
"rewards/margins": 6.685744285583496,
"rewards/rejected": -29.9857234954834,
"step": 205
},
{
"epoch": 0.709427464485579,
"grad_norm": 160.2297179052625,
"learning_rate": 1.8762413812404537e-07,
"logits/chosen": -15.564806938171387,
"logits/rejected": -15.336395263671875,
"logps/chosen": -2.414278507232666,
"logps/rejected": -3.099297523498535,
"loss": 2.3344,
"rewards/accuracies": 0.875,
"rewards/chosen": -24.142784118652344,
"rewards/margins": 6.850188255310059,
"rewards/rejected": -30.992971420288086,
"step": 206
},
{
"epoch": 0.7128712871287128,
"grad_norm": 159.59891745599342,
"learning_rate": 1.8355959700605835e-07,
"logits/chosen": -16.28492546081543,
"logits/rejected": -15.95881462097168,
"logps/chosen": -2.8605947494506836,
"logps/rejected": -3.867330551147461,
"loss": 2.8207,
"rewards/accuracies": 0.9375,
"rewards/chosen": -28.60594367980957,
"rewards/margins": 10.067360877990723,
"rewards/rejected": -38.67330551147461,
"step": 207
},
{
"epoch": 0.7163151097718468,
"grad_norm": 138.3874486270341,
"learning_rate": 1.7952641413828285e-07,
"logits/chosen": -14.021824836730957,
"logits/rejected": -14.172553062438965,
"logps/chosen": -1.9960724115371704,
"logps/rejected": -2.703728199005127,
"loss": 2.7886,
"rewards/accuracies": 0.8125,
"rewards/chosen": -19.960723876953125,
"rewards/margins": 7.076559066772461,
"rewards/rejected": -27.037281036376953,
"step": 208
},
{
"epoch": 0.7197589324149807,
"grad_norm": 139.88886492945034,
"learning_rate": 1.755251738550471e-07,
"logits/chosen": -17.956972122192383,
"logits/rejected": -17.393238067626953,
"logps/chosen": -2.526200294494629,
"logps/rejected": -3.3326597213745117,
"loss": 2.5612,
"rewards/accuracies": 0.8125,
"rewards/chosen": -25.262001037597656,
"rewards/margins": 8.064594268798828,
"rewards/rejected": -33.32659149169922,
"step": 209
},
{
"epoch": 0.7232027550581145,
"grad_norm": 121.87663059806626,
"learning_rate": 1.7155645586278396e-07,
"logits/chosen": -16.801706314086914,
"logits/rejected": -17.29227638244629,
"logps/chosen": -2.4827024936676025,
"logps/rejected": -3.251868724822998,
"loss": 2.3086,
"rewards/accuracies": 1.0,
"rewards/chosen": -24.827022552490234,
"rewards/margins": 7.691664218902588,
"rewards/rejected": -32.5186882019043,
"step": 210
},
{
"epoch": 0.7266465777012484,
"grad_norm": 183.5598825145353,
"learning_rate": 1.6762083515604205e-07,
"logits/chosen": -16.376623153686523,
"logits/rejected": -16.853988647460938,
"logps/chosen": -2.1823861598968506,
"logps/rejected": -2.469951629638672,
"loss": 3.0611,
"rewards/accuracies": 0.625,
"rewards/chosen": -21.82386016845703,
"rewards/margins": 2.8756532669067383,
"rewards/rejected": -24.69951629638672,
"step": 211
},
{
"epoch": 0.7300904003443822,
"grad_norm": 140.6766155274216,
"learning_rate": 1.6371888193417962e-07,
"logits/chosen": -16.161523818969727,
"logits/rejected": -15.664430618286133,
"logps/chosen": -2.5164270401000977,
"logps/rejected": -3.5702481269836426,
"loss": 1.5852,
"rewards/accuracies": 0.8125,
"rewards/chosen": -25.16427230834961,
"rewards/margins": 10.538207054138184,
"rewards/rejected": -35.702476501464844,
"step": 212
},
{
"epoch": 0.7335342229875161,
"grad_norm": 145.77478214483807,
"learning_rate": 1.598511615187527e-07,
"logits/chosen": -16.284515380859375,
"logits/rejected": -15.916769027709961,
"logps/chosen": -1.8510792255401611,
"logps/rejected": -2.855922222137451,
"loss": 2.4349,
"rewards/accuracies": 0.8125,
"rewards/chosen": -18.51078987121582,
"rewards/margins": 10.048429489135742,
"rewards/rejected": -28.559221267700195,
"step": 213
},
{
"epoch": 0.7369780456306501,
"grad_norm": 165.3902758912234,
"learning_rate": 1.560182342716109e-07,
"logits/chosen": -16.97911834716797,
"logits/rejected": -17.0489444732666,
"logps/chosen": -2.855278253555298,
"logps/rejected": -3.1526296138763428,
"loss": 2.8624,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.552780151367188,
"rewards/margins": 2.9735140800476074,
"rewards/rejected": -31.526294708251953,
"step": 214
},
{
"epoch": 0.7404218682737839,
"grad_norm": 169.1752973988815,
"learning_rate": 1.5222065551371078e-07,
"logits/chosen": -16.086669921875,
"logits/rejected": -15.614689826965332,
"logps/chosen": -2.2419023513793945,
"logps/rejected": -2.8294296264648438,
"loss": 2.1771,
"rewards/accuracies": 0.875,
"rewards/chosen": -22.41902732849121,
"rewards/margins": 5.875270843505859,
"rewards/rejected": -28.294296264648438,
"step": 215
},
{
"epoch": 0.7438656909169178,
"grad_norm": 175.5809330835783,
"learning_rate": 1.4845897544466062e-07,
"logits/chosen": -15.881332397460938,
"logits/rejected": -16.038084030151367,
"logps/chosen": -1.938501238822937,
"logps/rejected": -2.6783177852630615,
"loss": 2.5021,
"rewards/accuracies": 0.875,
"rewards/chosen": -19.385011672973633,
"rewards/margins": 7.398165702819824,
"rewards/rejected": -26.783178329467773,
"step": 216
},
{
"epoch": 0.7473095135600517,
"grad_norm": 138.7906401630708,
"learning_rate": 1.4473373906300576e-07,
"logits/chosen": -14.417771339416504,
"logits/rejected": -14.370369911193848,
"logps/chosen": -1.7395800352096558,
"logps/rejected": -2.279273271560669,
"loss": 2.329,
"rewards/accuracies": 0.875,
"rewards/chosen": -17.39579963684082,
"rewards/margins": 5.396934986114502,
"rewards/rejected": -22.79273223876953,
"step": 217
},
{
"epoch": 0.7507533362031855,
"grad_norm": 177.5157013301922,
"learning_rate": 1.4104548608726895e-07,
"logits/chosen": -17.694894790649414,
"logits/rejected": -17.320873260498047,
"logps/chosen": -2.5375585556030273,
"logps/rejected": -3.9845151901245117,
"loss": 2.2671,
"rewards/accuracies": 0.9375,
"rewards/chosen": -25.375581741333008,
"rewards/margins": 14.46957015991211,
"rewards/rejected": -39.84515380859375,
"step": 218
},
{
"epoch": 0.7541971588463194,
"grad_norm": 134.9327740603631,
"learning_rate": 1.3739475087775466e-07,
"logits/chosen": -15.686095237731934,
"logits/rejected": -15.889823913574219,
"logps/chosen": -2.468545913696289,
"logps/rejected": -2.932441234588623,
"loss": 2.5443,
"rewards/accuracies": 0.75,
"rewards/chosen": -24.685457229614258,
"rewards/margins": 4.638955116271973,
"rewards/rejected": -29.324413299560547,
"step": 219
},
{
"epoch": 0.7576409814894532,
"grad_norm": 135.47400869262708,
"learning_rate": 1.3378206235913028e-07,
"logits/chosen": -15.72805404663086,
"logits/rejected": -15.899078369140625,
"logps/chosen": -1.9772777557373047,
"logps/rejected": -2.4077036380767822,
"loss": 2.3468,
"rewards/accuracies": 0.6875,
"rewards/chosen": -19.77277946472168,
"rewards/margins": 4.304256916046143,
"rewards/rejected": -24.07703399658203,
"step": 220
},
{
"epoch": 0.7610848041325872,
"grad_norm": 162.24671967985978,
"learning_rate": 1.3020794394379447e-07,
"logits/chosen": -16.417884826660156,
"logits/rejected": -15.766003608703613,
"logps/chosen": -3.0476391315460205,
"logps/rejected": -4.174098491668701,
"loss": 2.7984,
"rewards/accuracies": 0.9375,
"rewards/chosen": -30.476388931274414,
"rewards/margins": 11.264592170715332,
"rewards/rejected": -41.74098205566406,
"step": 221
},
{
"epoch": 0.7645286267757211,
"grad_norm": 153.11579053938232,
"learning_rate": 1.2667291345604433e-07,
"logits/chosen": -16.348756790161133,
"logits/rejected": -16.798192977905273,
"logps/chosen": -2.2396018505096436,
"logps/rejected": -2.7667808532714844,
"loss": 2.4575,
"rewards/accuracies": 0.6875,
"rewards/chosen": -22.39601707458496,
"rewards/margins": 5.271792411804199,
"rewards/rejected": -27.66781234741211,
"step": 222
},
{
"epoch": 0.7679724494188549,
"grad_norm": 125.46348058673289,
"learning_rate": 1.2317748305705217e-07,
"logits/chosen": -17.994367599487305,
"logits/rejected": -18.30419921875,
"logps/chosen": -2.42231822013855,
"logps/rejected": -2.860621690750122,
"loss": 2.578,
"rewards/accuracies": 0.75,
"rewards/chosen": -24.223180770874023,
"rewards/margins": 4.383035659790039,
"rewards/rejected": -28.606216430664062,
"step": 223
},
{
"epoch": 0.7714162720619888,
"grad_norm": 130.3454994188522,
"learning_rate": 1.1972215917066307e-07,
"logits/chosen": -17.247783660888672,
"logits/rejected": -16.855878829956055,
"logps/chosen": -2.419665813446045,
"logps/rejected": -3.496854782104492,
"loss": 2.3658,
"rewards/accuracies": 0.875,
"rewards/chosen": -24.196657180786133,
"rewards/margins": 10.771889686584473,
"rewards/rejected": -34.968544006347656,
"step": 224
},
{
"epoch": 0.7748600947051227,
"grad_norm": 172.91319963893733,
"learning_rate": 1.1630744241002223e-07,
"logits/chosen": -17.969970703125,
"logits/rejected": -18.13035011291504,
"logps/chosen": -2.1760947704315186,
"logps/rejected": -2.7468371391296387,
"loss": 2.4968,
"rewards/accuracies": 0.8125,
"rewards/chosen": -21.760944366455078,
"rewards/margins": 5.707423210144043,
"rewards/rejected": -27.46837043762207,
"step": 225
},
{
"epoch": 0.7783039173482565,
"grad_norm": 113.76103569407015,
"learning_rate": 1.1293382750504688e-07,
"logits/chosen": -18.275222778320312,
"logits/rejected": -17.712696075439453,
"logps/chosen": -2.3341901302337646,
"logps/rejected": -3.101623773574829,
"loss": 2.1953,
"rewards/accuracies": 0.875,
"rewards/chosen": -23.341899871826172,
"rewards/margins": 7.674335479736328,
"rewards/rejected": -31.016237258911133,
"step": 226
},
{
"epoch": 0.7817477399913905,
"grad_norm": 132.91068429588344,
"learning_rate": 1.0960180323074774e-07,
"logits/chosen": -18.44281578063965,
"logits/rejected": -18.48580551147461,
"logps/chosen": -2.326096296310425,
"logps/rejected": -3.35933518409729,
"loss": 1.9983,
"rewards/accuracies": 0.875,
"rewards/chosen": -23.260963439941406,
"rewards/margins": 10.332388877868652,
"rewards/rejected": -33.593353271484375,
"step": 227
},
{
"epoch": 0.7851915626345243,
"grad_norm": 148.59038179673675,
"learning_rate": 1.0631185233641474e-07,
"logits/chosen": -18.724830627441406,
"logits/rejected": -18.60055923461914,
"logps/chosen": -2.2051844596862793,
"logps/rejected": -3.1807661056518555,
"loss": 2.2086,
"rewards/accuracies": 0.8125,
"rewards/chosen": -22.05184555053711,
"rewards/margins": 9.755819320678711,
"rewards/rejected": -31.80766487121582,
"step": 228
},
{
"epoch": 0.7886353852776582,
"grad_norm": 168.3363506588075,
"learning_rate": 1.0306445147567604e-07,
"logits/chosen": -16.910625457763672,
"logits/rejected": -17.089317321777344,
"logps/chosen": -2.1940903663635254,
"logps/rejected": -2.7529542446136475,
"loss": 3.2585,
"rewards/accuracies": 0.75,
"rewards/chosen": -21.940902709960938,
"rewards/margins": 5.588636875152588,
"rewards/rejected": -27.529541015625,
"step": 229
},
{
"epoch": 0.7920792079207921,
"grad_norm": 140.0112579178903,
"learning_rate": 9.986007113743906e-08,
"logits/chosen": -17.158119201660156,
"logits/rejected": -17.688983917236328,
"logps/chosen": -2.0375654697418213,
"logps/rejected": -2.7925596237182617,
"loss": 2.1608,
"rewards/accuracies": 0.8125,
"rewards/chosen": -20.375656127929688,
"rewards/margins": 7.549938201904297,
"rewards/rejected": -27.925594329833984,
"step": 230
},
{
"epoch": 0.7955230305639259,
"grad_norm": 147.83687034828287,
"learning_rate": 9.669917557772542e-08,
"logits/chosen": -17.468488693237305,
"logits/rejected": -17.5322208404541,
"logps/chosen": -2.3444223403930664,
"logps/rejected": -2.8957479000091553,
"loss": 2.1804,
"rewards/accuracies": 0.8125,
"rewards/chosen": -23.44422149658203,
"rewards/margins": 5.513256549835205,
"rewards/rejected": -28.95747947692871,
"step": 231
},
{
"epoch": 0.7989668532070598,
"grad_norm": 182.36869206020535,
"learning_rate": 9.358222275240884e-08,
"logits/chosen": -17.69605255126953,
"logits/rejected": -17.339937210083008,
"logps/chosen": -2.6390442848205566,
"logps/rejected": -3.474766254425049,
"loss": 2.8717,
"rewards/accuracies": 0.8125,
"rewards/chosen": -26.39044189453125,
"rewards/margins": 8.357217788696289,
"rewards/rejected": -34.74766159057617,
"step": 232
},
{
"epoch": 0.8024106758501938,
"grad_norm": 253.44665519838688,
"learning_rate": 9.050966425086546e-08,
"logits/chosen": -17.90776252746582,
"logits/rejected": -18.13881492614746,
"logps/chosen": -2.3884530067443848,
"logps/rejected": -3.7100043296813965,
"loss": 3.4066,
"rewards/accuracies": 0.875,
"rewards/chosen": -23.884531021118164,
"rewards/margins": 13.215514183044434,
"rewards/rejected": -37.10004425048828,
"step": 233
},
{
"epoch": 0.8058544984933276,
"grad_norm": 133.61246579427012,
"learning_rate": 8.748194523054748e-08,
"logits/chosen": -17.281970977783203,
"logits/rejected": -17.455997467041016,
"logps/chosen": -2.43415904045105,
"logps/rejected": -3.0494344234466553,
"loss": 1.9898,
"rewards/accuracies": 0.8125,
"rewards/chosen": -24.341590881347656,
"rewards/margins": 6.152754783630371,
"rewards/rejected": -30.49434471130371,
"step": 234
},
{
"epoch": 0.8092983211364615,
"grad_norm": 147.12423003196992,
"learning_rate": 8.449950435248676e-08,
"logits/chosen": -17.537288665771484,
"logits/rejected": -17.6503963470459,
"logps/chosen": -2.411252737045288,
"logps/rejected": -2.5560195446014404,
"loss": 2.959,
"rewards/accuracies": 0.625,
"rewards/chosen": -24.11252784729004,
"rewards/margins": 1.447667121887207,
"rewards/rejected": -25.56019401550293,
"step": 235
},
{
"epoch": 0.8127421437795953,
"grad_norm": 173.4351632725469,
"learning_rate": 8.15627737177425e-08,
"logits/chosen": -15.300152778625488,
"logits/rejected": -14.813128471374512,
"logps/chosen": -2.2649199962615967,
"logps/rejected": -3.06396222114563,
"loss": 3.1114,
"rewards/accuracies": 0.8125,
"rewards/chosen": -22.649198532104492,
"rewards/margins": 7.99042272567749,
"rewards/rejected": -30.63962173461914,
"step": 236
},
{
"epoch": 0.8161859664227292,
"grad_norm": 148.7657685969793,
"learning_rate": 7.867217880479629e-08,
"logits/chosen": -15.767210006713867,
"logits/rejected": -15.650933265686035,
"logps/chosen": -2.087116003036499,
"logps/rejected": -3.3104443550109863,
"loss": 2.4406,
"rewards/accuracies": 0.8125,
"rewards/chosen": -20.87116241455078,
"rewards/margins": 12.23327922821045,
"rewards/rejected": -33.10444259643555,
"step": 237
},
{
"epoch": 0.8196297890658631,
"grad_norm": 151.07749718687427,
"learning_rate": 7.582813840790847e-08,
"logits/chosen": -15.618885040283203,
"logits/rejected": -15.997750282287598,
"logps/chosen": -2.0786385536193848,
"logps/rejected": -2.7961952686309814,
"loss": 2.7965,
"rewards/accuracies": 0.6875,
"rewards/chosen": -20.786386489868164,
"rewards/margins": 7.175565719604492,
"rewards/rejected": -27.96195411682129,
"step": 238
},
{
"epoch": 0.823073611708997,
"grad_norm": 144.2174329571498,
"learning_rate": 7.303106457644328e-08,
"logits/chosen": -16.712963104248047,
"logits/rejected": -16.825284957885742,
"logps/chosen": -2.238386392593384,
"logps/rejected": -3.5630311965942383,
"loss": 2.8129,
"rewards/accuracies": 1.0,
"rewards/chosen": -22.383865356445312,
"rewards/margins": 13.246443748474121,
"rewards/rejected": -35.63031005859375,
"step": 239
},
{
"epoch": 0.8265174343521309,
"grad_norm": 169.73215342488882,
"learning_rate": 7.028136255516938e-08,
"logits/chosen": -18.084627151489258,
"logits/rejected": -18.091758728027344,
"logps/chosen": -3.0889220237731934,
"logps/rejected": -3.6106934547424316,
"loss": 2.5796,
"rewards/accuracies": 0.875,
"rewards/chosen": -30.88922119140625,
"rewards/margins": 5.217716693878174,
"rewards/rejected": -36.106937408447266,
"step": 240
},
{
"epoch": 0.8299612569952648,
"grad_norm": 149.39660539357982,
"learning_rate": 6.75794307255479e-08,
"logits/chosen": -17.41098976135254,
"logits/rejected": -17.147449493408203,
"logps/chosen": -2.440295457839966,
"logps/rejected": -3.6117165088653564,
"loss": 2.1983,
"rewards/accuracies": 0.9375,
"rewards/chosen": -24.4029541015625,
"rewards/margins": 11.714213371276855,
"rewards/rejected": -36.11716842651367,
"step": 241
},
{
"epoch": 0.8334050796383986,
"grad_norm": 154.41052222889476,
"learning_rate": 6.492566054801414e-08,
"logits/chosen": -17.25594139099121,
"logits/rejected": -17.346635818481445,
"logps/chosen": -2.8043160438537598,
"logps/rejected": -3.589071750640869,
"loss": 2.6675,
"rewards/accuracies": 0.9375,
"rewards/chosen": -28.04315948486328,
"rewards/margins": 7.847556114196777,
"rewards/rejected": -35.890716552734375,
"step": 242
},
{
"epoch": 0.8368489022815325,
"grad_norm": 144.1864772937289,
"learning_rate": 6.232043650526195e-08,
"logits/chosen": -19.001102447509766,
"logits/rejected": -19.054353713989258,
"logps/chosen": -2.5245139598846436,
"logps/rejected": -2.9145162105560303,
"loss": 2.6573,
"rewards/accuracies": 0.75,
"rewards/chosen": -25.245140075683594,
"rewards/margins": 3.9000210762023926,
"rewards/rejected": -29.145160675048828,
"step": 243
},
{
"epoch": 0.8402927249246663,
"grad_norm": 159.15922893834966,
"learning_rate": 5.976413604653978e-08,
"logits/chosen": -16.62332534790039,
"logits/rejected": -17.053430557250977,
"logps/chosen": -2.617964267730713,
"logps/rejected": -3.058418035507202,
"loss": 2.3253,
"rewards/accuracies": 0.875,
"rewards/chosen": -26.179641723632812,
"rewards/margins": 4.404535293579102,
"rewards/rejected": -30.58417510986328,
"step": 244
},
{
"epoch": 0.8437365475678003,
"grad_norm": 140.0984307995632,
"learning_rate": 5.725712953296438e-08,
"logits/chosen": -15.821372985839844,
"logits/rejected": -15.742339134216309,
"logps/chosen": -1.8620578050613403,
"logps/rejected": -2.808138847351074,
"loss": 2.4966,
"rewards/accuracies": 0.9375,
"rewards/chosen": -18.62057876586914,
"rewards/margins": 9.460811614990234,
"rewards/rejected": -28.081388473510742,
"step": 245
},
{
"epoch": 0.8471803702109342,
"grad_norm": 147.6831357797836,
"learning_rate": 5.479978018386275e-08,
"logits/chosen": -17.423532485961914,
"logits/rejected": -17.499662399291992,
"logps/chosen": -2.267676591873169,
"logps/rejected": -2.954185962677002,
"loss": 2.4517,
"rewards/accuracies": 0.8125,
"rewards/chosen": -22.676767349243164,
"rewards/margins": 6.8650922775268555,
"rewards/rejected": -29.541858673095703,
"step": 246
},
{
"epoch": 0.850624192854068,
"grad_norm": 146.93865641824823,
"learning_rate": 5.23924440241486e-08,
"logits/chosen": -17.467391967773438,
"logits/rejected": -17.293798446655273,
"logps/chosen": -2.765316963195801,
"logps/rejected": -3.1737022399902344,
"loss": 3.1905,
"rewards/accuracies": 0.6875,
"rewards/chosen": -27.65317153930664,
"rewards/margins": 4.083853244781494,
"rewards/rejected": -31.737022399902344,
"step": 247
},
{
"epoch": 0.8540680154972019,
"grad_norm": 128.25480467364565,
"learning_rate": 5.003546983274014e-08,
"logits/chosen": -17.457887649536133,
"logits/rejected": -17.364229202270508,
"logps/chosen": -2.2277069091796875,
"logps/rejected": -3.1269993782043457,
"loss": 2.1543,
"rewards/accuracies": 0.8125,
"rewards/chosen": -22.277069091796875,
"rewards/margins": 8.992926597595215,
"rewards/rejected": -31.26999282836914,
"step": 248
},
{
"epoch": 0.8575118381403358,
"grad_norm": 126.61876835603796,
"learning_rate": 4.77291990920289e-08,
"logits/chosen": -15.818652153015137,
"logits/rejected": -15.359832763671875,
"logps/chosen": -1.8357523679733276,
"logps/rejected": -2.8696041107177734,
"loss": 2.5736,
"rewards/accuracies": 0.875,
"rewards/chosen": -18.357524871826172,
"rewards/margins": 10.33851432800293,
"rewards/rejected": -28.69603729248047,
"step": 249
},
{
"epoch": 0.8609556607834696,
"grad_norm": 147.3951096860917,
"learning_rate": 4.5473965938405e-08,
"logits/chosen": -17.267982482910156,
"logits/rejected": -17.239002227783203,
"logps/chosen": -2.3899097442626953,
"logps/rejected": -3.6304116249084473,
"loss": 2.1633,
"rewards/accuracies": 0.9375,
"rewards/chosen": -23.899097442626953,
"rewards/margins": 12.405017852783203,
"rewards/rejected": -36.304115295410156,
"step": 250
},
{
"epoch": 0.8643994834266036,
"grad_norm": 131.8214642518433,
"learning_rate": 4.32700971138471e-08,
"logits/chosen": -16.42135238647461,
"logits/rejected": -17.10132598876953,
"logps/chosen": -2.0020933151245117,
"logps/rejected": -2.6545495986938477,
"loss": 2.1802,
"rewards/accuracies": 0.75,
"rewards/chosen": -20.02093505859375,
"rewards/margins": 6.524560928344727,
"rewards/rejected": -26.54549789428711,
"step": 251
},
{
"epoch": 0.8678433060697374,
"grad_norm": 129.82185059004897,
"learning_rate": 4.11179119185832e-08,
"logits/chosen": -15.84195327758789,
"logits/rejected": -15.13234806060791,
"logps/chosen": -2.1187713146209717,
"logps/rejected": -3.074852228164673,
"loss": 2.0957,
"rewards/accuracies": 0.8125,
"rewards/chosen": -21.187713623046875,
"rewards/margins": 9.560807228088379,
"rewards/rejected": -30.748519897460938,
"step": 252
},
{
"epoch": 0.8712871287128713,
"grad_norm": 147.62415325752085,
"learning_rate": 3.9017722164830014e-08,
"logits/chosen": -15.684465408325195,
"logits/rejected": -15.583179473876953,
"logps/chosen": -2.2432422637939453,
"logps/rejected": -3.128464698791504,
"loss": 2.2196,
"rewards/accuracies": 0.8125,
"rewards/chosen": -22.432418823242188,
"rewards/margins": 8.852226257324219,
"rewards/rejected": -31.284645080566406,
"step": 253
},
{
"epoch": 0.8747309513560052,
"grad_norm": 129.59541911753672,
"learning_rate": 3.696983213161724e-08,
"logits/chosen": -16.10622215270996,
"logits/rejected": -15.97018814086914,
"logps/chosen": -2.268676519393921,
"logps/rejected": -3.3363349437713623,
"loss": 1.8693,
"rewards/accuracies": 0.9375,
"rewards/chosen": -22.686765670776367,
"rewards/margins": 10.676584243774414,
"rewards/rejected": -33.363346099853516,
"step": 254
},
{
"epoch": 0.878174773999139,
"grad_norm": 137.48943851483648,
"learning_rate": 3.4974538520702756e-08,
"logits/chosen": -14.73385238647461,
"logits/rejected": -14.685803413391113,
"logps/chosen": -2.011627197265625,
"logps/rejected": -2.8314733505249023,
"loss": 2.1826,
"rewards/accuracies": 0.875,
"rewards/chosen": -20.11627197265625,
"rewards/margins": 8.198460578918457,
"rewards/rejected": -28.314733505249023,
"step": 255
},
{
"epoch": 0.8816185966422729,
"grad_norm": 159.72968498568136,
"learning_rate": 3.303213041358628e-08,
"logits/chosen": -16.42586898803711,
"logits/rejected": -16.5064697265625,
"logps/chosen": -2.3388772010803223,
"logps/rejected": -3.104121208190918,
"loss": 2.3098,
"rewards/accuracies": 0.625,
"rewards/chosen": -23.388771057128906,
"rewards/margins": 7.652439117431641,
"rewards/rejected": -31.041210174560547,
"step": 256
},
{
"epoch": 0.8850624192854069,
"grad_norm": 139.996512233341,
"learning_rate": 3.114288922962673e-08,
"logits/chosen": -15.805761337280273,
"logits/rejected": -15.97050666809082,
"logps/chosen": -2.2719080448150635,
"logps/rejected": -2.8697736263275146,
"loss": 1.9082,
"rewards/accuracies": 0.75,
"rewards/chosen": -22.719078063964844,
"rewards/margins": 5.9786577224731445,
"rewards/rejected": -28.697734832763672,
"step": 257
},
{
"epoch": 0.8885062419285407,
"grad_norm": 142.0757665384917,
"learning_rate": 2.9307088685269544e-08,
"logits/chosen": -16.687719345092773,
"logits/rejected": -16.818946838378906,
"logps/chosen": -2.10107421875,
"logps/rejected": -2.945895195007324,
"loss": 2.0949,
"rewards/accuracies": 0.8125,
"rewards/chosen": -21.010740280151367,
"rewards/margins": 8.448209762573242,
"rewards/rejected": -29.45895004272461,
"step": 258
},
{
"epoch": 0.8919500645716746,
"grad_norm": 132.30438095070775,
"learning_rate": 2.7524994754390206e-08,
"logits/chosen": -18.812986373901367,
"logits/rejected": -18.67325210571289,
"logps/chosen": -2.727328300476074,
"logps/rejected": -3.1050448417663574,
"loss": 2.0166,
"rewards/accuracies": 0.8125,
"rewards/chosen": -27.27328109741211,
"rewards/margins": 3.7771661281585693,
"rewards/rejected": -31.050447463989258,
"step": 259
},
{
"epoch": 0.8953938872148084,
"grad_norm": 146.74697878065496,
"learning_rate": 2.5796865629759622e-08,
"logits/chosen": -16.785795211791992,
"logits/rejected": -16.06755256652832,
"logps/chosen": -2.190410852432251,
"logps/rejected": -3.289196014404297,
"loss": 2.7578,
"rewards/accuracies": 0.6875,
"rewards/chosen": -21.90410614013672,
"rewards/margins": 10.987853050231934,
"rewards/rejected": -32.89196014404297,
"step": 260
},
{
"epoch": 0.8988377098579423,
"grad_norm": 206.78249442348837,
"learning_rate": 2.4122951685636674e-08,
"logits/chosen": -16.686670303344727,
"logits/rejected": -16.18621826171875,
"logps/chosen": -2.611176013946533,
"logps/rejected": -3.921743869781494,
"loss": 3.0397,
"rewards/accuracies": 1.0,
"rewards/chosen": -26.11176109313965,
"rewards/margins": 13.10567855834961,
"rewards/rejected": -39.217437744140625,
"step": 261
},
{
"epoch": 0.9022815325010762,
"grad_norm": 1949.594124426928,
"learning_rate": 2.2503495441493503e-08,
"logits/chosen": -16.57187271118164,
"logits/rejected": -16.915918350219727,
"logps/chosen": -1.6403616666793823,
"logps/rejected": -2.541311740875244,
"loss": 2.0577,
"rewards/accuracies": 0.875,
"rewards/chosen": -16.40361785888672,
"rewards/margins": 9.009501457214355,
"rewards/rejected": -25.41312026977539,
"step": 262
},
{
"epoch": 0.90572535514421,
"grad_norm": 144.45425989954398,
"learning_rate": 2.093873152687906e-08,
"logits/chosen": -16.985397338867188,
"logits/rejected": -16.533916473388672,
"logps/chosen": -2.0635147094726562,
"logps/rejected": -3.0182905197143555,
"loss": 2.2249,
"rewards/accuracies": 1.0,
"rewards/chosen": -20.635149002075195,
"rewards/margins": 9.547759056091309,
"rewards/rejected": -30.18290901184082,
"step": 263
},
{
"epoch": 0.909169177787344,
"grad_norm": 161.70566070216898,
"learning_rate": 1.9428886647425214e-08,
"logits/chosen": -18.097816467285156,
"logits/rejected": -17.79208755493164,
"logps/chosen": -2.152082681655884,
"logps/rejected": -2.9253194332122803,
"loss": 2.632,
"rewards/accuracies": 0.75,
"rewards/chosen": -21.52082633972168,
"rewards/margins": 7.732368469238281,
"rewards/rejected": -29.253196716308594,
"step": 264
},
{
"epoch": 0.9126130004304779,
"grad_norm": 129.93371206599926,
"learning_rate": 1.7974179552001866e-08,
"logits/chosen": -17.505081176757812,
"logits/rejected": -17.78946304321289,
"logps/chosen": -2.234717845916748,
"logps/rejected": -2.539454936981201,
"loss": 2.4342,
"rewards/accuracies": 0.6875,
"rewards/chosen": -22.347179412841797,
"rewards/margins": 3.0473670959472656,
"rewards/rejected": -25.394546508789062,
"step": 265
},
{
"epoch": 0.9160568230736117,
"grad_norm": 151.94990436818313,
"learning_rate": 1.6574821001023474e-08,
"logits/chosen": -18.333892822265625,
"logits/rejected": -18.197607040405273,
"logps/chosen": -1.9889158010482788,
"logps/rejected": -2.735637903213501,
"loss": 2.2676,
"rewards/accuracies": 0.9375,
"rewards/chosen": -19.889158248901367,
"rewards/margins": 7.467221260070801,
"rewards/rejected": -27.356380462646484,
"step": 266
},
{
"epoch": 0.9195006457167456,
"grad_norm": 157.25320445860555,
"learning_rate": 1.5231013735914444e-08,
"logits/chosen": -16.340839385986328,
"logits/rejected": -16.46707534790039,
"logps/chosen": -2.3339834213256836,
"logps/rejected": -2.750584840774536,
"loss": 2.4662,
"rewards/accuracies": 0.5625,
"rewards/chosen": -23.339832305908203,
"rewards/margins": 4.166016101837158,
"rewards/rejected": -27.50585174560547,
"step": 267
},
{
"epoch": 0.9229444683598794,
"grad_norm": 153.71273684620283,
"learning_rate": 1.3942952449735201e-08,
"logits/chosen": -17.994409561157227,
"logits/rejected": -17.976512908935547,
"logps/chosen": -2.3722710609436035,
"logps/rejected": -3.688662528991699,
"loss": 2.6692,
"rewards/accuracies": 0.875,
"rewards/chosen": -23.722707748413086,
"rewards/margins": 13.163920402526855,
"rewards/rejected": -36.886627197265625,
"step": 268
},
{
"epoch": 0.9263882910030133,
"grad_norm": 159.9085920533978,
"learning_rate": 1.2710823758974676e-08,
"logits/chosen": -17.915321350097656,
"logits/rejected": -17.735774993896484,
"logps/chosen": -2.341430425643921,
"logps/rejected": -3.303116798400879,
"loss": 2.5158,
"rewards/accuracies": 0.8125,
"rewards/chosen": -23.414304733276367,
"rewards/margins": 9.616860389709473,
"rewards/rejected": -33.031166076660156,
"step": 269
},
{
"epoch": 0.9298321136461473,
"grad_norm": 173.1432244469156,
"learning_rate": 1.1534806176513434e-08,
"logits/chosen": -17.020999908447266,
"logits/rejected": -16.834182739257812,
"logps/chosen": -3.0662012100219727,
"logps/rejected": -4.367783546447754,
"loss": 2.8725,
"rewards/accuracies": 0.75,
"rewards/chosen": -30.66200828552246,
"rewards/margins": 13.01583194732666,
"rewards/rejected": -43.67784118652344,
"step": 270
},
{
"epoch": 0.9332759362892811,
"grad_norm": 175.82982939834184,
"learning_rate": 1.0415070085759925e-08,
"logits/chosen": -18.364770889282227,
"logits/rejected": -17.562824249267578,
"logps/chosen": -1.9744817018508911,
"logps/rejected": -2.404961347579956,
"loss": 2.7178,
"rewards/accuracies": 0.9375,
"rewards/chosen": -19.744815826416016,
"rewards/margins": 4.304797172546387,
"rewards/rejected": -24.049612045288086,
"step": 271
},
{
"epoch": 0.936719758932415,
"grad_norm": 138.65454205055815,
"learning_rate": 9.351777715965337e-09,
"logits/chosen": -18.179439544677734,
"logits/rejected": -18.3562068939209,
"logps/chosen": -2.5122263431549072,
"logps/rejected": -2.9324097633361816,
"loss": 2.0112,
"rewards/accuracies": 0.625,
"rewards/chosen": -25.122264862060547,
"rewards/margins": 4.2018327713012695,
"rewards/rejected": -29.3240966796875,
"step": 272
},
{
"epoch": 0.9401635815755489,
"grad_norm": 166.37792305846952,
"learning_rate": 8.345083118719509e-09,
"logits/chosen": -17.025907516479492,
"logits/rejected": -16.50684928894043,
"logps/chosen": -2.3154404163360596,
"logps/rejected": -3.7841498851776123,
"loss": 2.4035,
"rewards/accuracies": 0.875,
"rewards/chosen": -23.154401779174805,
"rewards/margins": 14.68709659576416,
"rewards/rejected": -37.84150314331055,
"step": 273
},
{
"epoch": 0.9436074042186827,
"grad_norm": 156.2441873918835,
"learning_rate": 7.395132145631544e-09,
"logits/chosen": -16.066808700561523,
"logits/rejected": -15.90978717803955,
"logps/chosen": -1.8371270895004272,
"logps/rejected": -2.678525924682617,
"loss": 2.0809,
"rewards/accuracies": 0.875,
"rewards/chosen": -18.37127113342285,
"rewards/margins": 8.413987159729004,
"rewards/rejected": -26.785259246826172,
"step": 274
},
{
"epoch": 0.9470512268618166,
"grad_norm": 153.24700496665506,
"learning_rate": 6.502062427198929e-09,
"logits/chosen": -17.553876876831055,
"logits/rejected": -17.5835018157959,
"logps/chosen": -2.4539918899536133,
"logps/rejected": -3.212017774581909,
"loss": 2.8158,
"rewards/accuracies": 0.875,
"rewards/chosen": -24.539915084838867,
"rewards/margins": 7.580263137817383,
"rewards/rejected": -32.12017822265625,
"step": 275
},
{
"epoch": 0.9504950495049505,
"grad_norm": 143.41309705270993,
"learning_rate": 5.666003352866733e-09,
"logits/chosen": -16.96306610107422,
"logits/rejected": -17.010374069213867,
"logps/chosen": -2.3547351360321045,
"logps/rejected": -3.560419797897339,
"loss": 2.1713,
"rewards/accuracies": 1.0,
"rewards/chosen": -23.547353744506836,
"rewards/margins": 12.056845664978027,
"rewards/rejected": -35.60419845581055,
"step": 276
},
{
"epoch": 0.9539388721480844,
"grad_norm": 127.28128452614312,
"learning_rate": 4.887076052282291e-09,
"logits/chosen": -16.996347427368164,
"logits/rejected": -17.045413970947266,
"logps/chosen": -2.568338394165039,
"logps/rejected": -3.7449233531951904,
"loss": 2.2934,
"rewards/accuracies": 0.75,
"rewards/chosen": -25.68338394165039,
"rewards/margins": 11.765849113464355,
"rewards/rejected": -37.44923400878906,
"step": 277
},
{
"epoch": 0.9573826947912183,
"grad_norm": 157.32940289604164,
"learning_rate": 4.165393377745108e-09,
"logits/chosen": -15.926298141479492,
"logits/rejected": -15.70862102508545,
"logps/chosen": -2.419119358062744,
"logps/rejected": -3.12341046333313,
"loss": 2.6577,
"rewards/accuracies": 0.8125,
"rewards/chosen": -24.191190719604492,
"rewards/margins": 7.042915344238281,
"rewards/rejected": -31.234106063842773,
"step": 278
},
{
"epoch": 0.9608265174343521,
"grad_norm": 166.13315912995816,
"learning_rate": 3.5010598878567387e-09,
"logits/chosen": -17.470443725585938,
"logits/rejected": -17.031620025634766,
"logps/chosen": -2.174730062484741,
"logps/rejected": -2.8888866901397705,
"loss": 2.6471,
"rewards/accuracies": 0.875,
"rewards/chosen": -21.747299194335938,
"rewards/margins": 7.1415696144104,
"rewards/rejected": -28.888866424560547,
"step": 279
},
{
"epoch": 0.964270340077486,
"grad_norm": 147.30485634603158,
"learning_rate": 2.8941718323724605e-09,
"logits/chosen": -18.49142837524414,
"logits/rejected": -18.490829467773438,
"logps/chosen": -2.174673080444336,
"logps/rejected": -2.49884295463562,
"loss": 2.4879,
"rewards/accuracies": 0.625,
"rewards/chosen": -21.746734619140625,
"rewards/margins": 3.241696357727051,
"rewards/rejected": -24.98843002319336,
"step": 280
},
{
"epoch": 0.9677141627206199,
"grad_norm": 164.2306340847377,
"learning_rate": 2.344817138256161e-09,
"logits/chosen": -16.280027389526367,
"logits/rejected": -17.136314392089844,
"logps/chosen": -2.1512930393218994,
"logps/rejected": -3.025418519973755,
"loss": 2.543,
"rewards/accuracies": 0.875,
"rewards/chosen": -21.512928009033203,
"rewards/margins": 8.741255760192871,
"rewards/rejected": -30.254182815551758,
"step": 281
},
{
"epoch": 0.9711579853637538,
"grad_norm": 150.11832572590862,
"learning_rate": 1.8530753969413282e-09,
"logits/chosen": -18.478666305541992,
"logits/rejected": -18.377622604370117,
"logps/chosen": -1.8686823844909668,
"logps/rejected": -2.416599750518799,
"loss": 2.3333,
"rewards/accuracies": 0.8125,
"rewards/chosen": -18.686824798583984,
"rewards/margins": 5.479173183441162,
"rewards/rejected": -24.165996551513672,
"step": 282
},
{
"epoch": 0.9746018080068877,
"grad_norm": 148.19843248369733,
"learning_rate": 1.4190178527999198e-09,
"logits/chosen": -18.65854263305664,
"logits/rejected": -18.510950088500977,
"logps/chosen": -2.684150457382202,
"logps/rejected": -3.3109562397003174,
"loss": 2.5508,
"rewards/accuracies": 0.75,
"rewards/chosen": -26.84150505065918,
"rewards/margins": 6.268057823181152,
"rewards/rejected": -33.109561920166016,
"step": 283
},
{
"epoch": 0.9780456306500215,
"grad_norm": 145.3408171333848,
"learning_rate": 1.0427073928200857e-09,
"logits/chosen": -16.74791717529297,
"logits/rejected": -17.11324119567871,
"logps/chosen": -1.9323251247406006,
"logps/rejected": -2.5761024951934814,
"loss": 2.6734,
"rewards/accuracies": 0.8125,
"rewards/chosen": -19.32324981689453,
"rewards/margins": 6.437774658203125,
"rewards/rejected": -25.761024475097656,
"step": 284
},
{
"epoch": 0.9814894532931554,
"grad_norm": 157.29447155335754,
"learning_rate": 7.241985374952797e-10,
"logits/chosen": -15.43560791015625,
"logits/rejected": -16.016206741333008,
"logps/chosen": -2.186183452606201,
"logps/rejected": -2.557718515396118,
"loss": 2.6831,
"rewards/accuracies": 0.75,
"rewards/chosen": -21.86183738708496,
"rewards/margins": 3.7153472900390625,
"rewards/rejected": -25.57718276977539,
"step": 285
},
{
"epoch": 0.9849332759362893,
"grad_norm": 169.85671811117746,
"learning_rate": 4.6353743292497637e-10,
"logits/chosen": -17.268774032592773,
"logits/rejected": -17.37803077697754,
"logps/chosen": -2.315028429031372,
"logps/rejected": -2.8562231063842773,
"loss": 3.1469,
"rewards/accuracies": 0.75,
"rewards/chosen": -23.150283813476562,
"rewards/margins": 5.411944389343262,
"rewards/rejected": -28.56222915649414,
"step": 286
},
{
"epoch": 0.9883770985794231,
"grad_norm": 134.22079815084547,
"learning_rate": 2.607618441292203e-10,
"logits/chosen": -17.780881881713867,
"logits/rejected": -17.691925048828125,
"logps/chosen": -2.1940627098083496,
"logps/rejected": -2.694575786590576,
"loss": 2.1033,
"rewards/accuracies": 0.75,
"rewards/chosen": -21.940628051757812,
"rewards/margins": 5.005130290985107,
"rewards/rejected": -26.945756912231445,
"step": 287
},
{
"epoch": 0.991820921222557,
"grad_norm": 159.97180537715076,
"learning_rate": 1.1590114957682473e-10,
"logits/chosen": -18.85245704650879,
"logits/rejected": -18.812564849853516,
"logps/chosen": -2.036210536956787,
"logps/rejected": -2.694965362548828,
"loss": 2.2158,
"rewards/accuracies": 0.8125,
"rewards/chosen": -20.362106323242188,
"rewards/margins": 6.58754825592041,
"rewards/rejected": -26.94965362548828,
"step": 288
},
{
"epoch": 0.995264743865691,
"grad_norm": 170.03207418013815,
"learning_rate": 2.8976336929353863e-11,
"logits/chosen": -17.584726333618164,
"logits/rejected": -17.696880340576172,
"logps/chosen": -2.0683584213256836,
"logps/rejected": -2.683166980743408,
"loss": 2.4093,
"rewards/accuracies": 0.75,
"rewards/chosen": -20.683582305908203,
"rewards/margins": 6.1480865478515625,
"rewards/rejected": -26.8316707611084,
"step": 289
},
{
"epoch": 0.9987085665088248,
"grad_norm": 122.7613085104077,
"learning_rate": 0.0,
"logits/chosen": -18.123817443847656,
"logits/rejected": -18.210723876953125,
"logps/chosen": -2.4845879077911377,
"logps/rejected": -3.084573268890381,
"loss": 2.7978,
"rewards/accuracies": 0.8125,
"rewards/chosen": -24.84588050842285,
"rewards/margins": 5.999850273132324,
"rewards/rejected": -30.845731735229492,
"step": 290
},
{
"epoch": 0.9987085665088248,
"step": 290,
"total_flos": 0.0,
"train_loss": 3.2915380025732106,
"train_runtime": 46073.3913,
"train_samples_per_second": 0.807,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1,
"max_steps": 290,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}