martimfasantos's picture
Model save
34f0fcf verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9964868029907215,
"eval_steps": 800,
"global_step": 2079,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014413115935501305,
"grad_norm": 21.287893295288086,
"learning_rate": 2.403846153846154e-09,
"logits/chosen": -2.3065450191497803,
"logits/rejected": -2.3093364238739014,
"logps/chosen": -43.837303161621094,
"logps/rejected": -48.05693054199219,
"loss": 0.6927,
"rewards/accuracies": 0.0625,
"rewards/chosen": 9.900308214128017e-06,
"rewards/margins": 0.0009647191036492586,
"rewards/rejected": -0.0009548187954351306,
"step": 1
},
{
"epoch": 0.014413115935501306,
"grad_norm": 21.087011337280273,
"learning_rate": 2.403846153846154e-08,
"logits/chosen": -2.3277149200439453,
"logits/rejected": -2.3011789321899414,
"logps/chosen": -42.81745910644531,
"logps/rejected": -44.89339065551758,
"loss": 0.6926,
"rewards/accuracies": 0.4583333432674408,
"rewards/chosen": -0.001182637526653707,
"rewards/margins": 0.0011362915392965078,
"rewards/rejected": -0.0023189291823655367,
"step": 10
},
{
"epoch": 0.02882623187100261,
"grad_norm": 19.580371856689453,
"learning_rate": 4.807692307692308e-08,
"logits/chosen": -2.2883663177490234,
"logits/rejected": -2.2757415771484375,
"logps/chosen": -45.45596694946289,
"logps/rejected": -48.15468978881836,
"loss": 0.6946,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.0008312638965435326,
"rewards/margins": -0.0028335480019450188,
"rewards/rejected": 0.0020022839307785034,
"step": 20
},
{
"epoch": 0.04323934780650392,
"grad_norm": 26.840009689331055,
"learning_rate": 7.21153846153846e-08,
"logits/chosen": -2.315314531326294,
"logits/rejected": -2.3029096126556396,
"logps/chosen": -46.84910202026367,
"logps/rejected": -48.4326286315918,
"loss": 0.6924,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.00018312002066522837,
"rewards/margins": 0.0015712290769442916,
"rewards/rejected": -0.00175434909760952,
"step": 30
},
{
"epoch": 0.05765246374200522,
"grad_norm": 22.58620834350586,
"learning_rate": 9.615384615384616e-08,
"logits/chosen": -2.347716808319092,
"logits/rejected": -2.338416576385498,
"logps/chosen": -50.591617584228516,
"logps/rejected": -52.742095947265625,
"loss": 0.6927,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.0013993385946378112,
"rewards/margins": 0.0010021533817052841,
"rewards/rejected": 0.0003971853293478489,
"step": 40
},
{
"epoch": 0.07206557967750653,
"grad_norm": 23.823856353759766,
"learning_rate": 1.2019230769230769e-07,
"logits/chosen": -2.329172134399414,
"logits/rejected": -2.3224873542785645,
"logps/chosen": -47.3341178894043,
"logps/rejected": -49.947471618652344,
"loss": 0.6924,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.002387039829045534,
"rewards/margins": 0.0015624122461304069,
"rewards/rejected": 0.0008246281067840755,
"step": 50
},
{
"epoch": 0.08647869561300783,
"grad_norm": 18.55199432373047,
"learning_rate": 1.442307692307692e-07,
"logits/chosen": -2.3057174682617188,
"logits/rejected": -2.287588596343994,
"logps/chosen": -46.57988739013672,
"logps/rejected": -48.87944793701172,
"loss": 0.6927,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.001985303359106183,
"rewards/margins": 0.0009407905163243413,
"rewards/rejected": 0.00104451272636652,
"step": 60
},
{
"epoch": 0.10089181154850914,
"grad_norm": 18.088035583496094,
"learning_rate": 1.6826923076923077e-07,
"logits/chosen": -2.3419766426086426,
"logits/rejected": -2.3270087242126465,
"logps/chosen": -47.5944709777832,
"logps/rejected": -50.76883316040039,
"loss": 0.6919,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0018801375990733504,
"rewards/margins": 0.0026488774456083775,
"rewards/rejected": -0.0007687400793656707,
"step": 70
},
{
"epoch": 0.11530492748401044,
"grad_norm": 18.39251708984375,
"learning_rate": 1.9230769230769231e-07,
"logits/chosen": -2.335756778717041,
"logits/rejected": -2.3095576763153076,
"logps/chosen": -44.72612380981445,
"logps/rejected": -48.02496337890625,
"loss": 0.6894,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.006220139563083649,
"rewards/margins": 0.007775151636451483,
"rewards/rejected": -0.001555012189783156,
"step": 80
},
{
"epoch": 0.12971804341951176,
"grad_norm": 18.626569747924805,
"learning_rate": 2.1634615384615386e-07,
"logits/chosen": -2.3171792030334473,
"logits/rejected": -2.291064500808716,
"logps/chosen": -44.88652420043945,
"logps/rejected": -46.83210372924805,
"loss": 0.6899,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": 0.008590105921030045,
"rewards/margins": 0.00671065878123045,
"rewards/rejected": 0.0018794465577229857,
"step": 90
},
{
"epoch": 0.14413115935501306,
"grad_norm": 23.2231388092041,
"learning_rate": 2.4038461538461537e-07,
"logits/chosen": -2.383881092071533,
"logits/rejected": -2.377704620361328,
"logps/chosen": -42.710289001464844,
"logps/rejected": -46.196533203125,
"loss": 0.6891,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": 0.014623338356614113,
"rewards/margins": 0.008355258964002132,
"rewards/rejected": 0.006268080323934555,
"step": 100
},
{
"epoch": 0.15854427529051437,
"grad_norm": 19.625394821166992,
"learning_rate": 2.6442307692307694e-07,
"logits/chosen": -2.310715436935425,
"logits/rejected": -2.3002336025238037,
"logps/chosen": -45.03856658935547,
"logps/rejected": -47.96285629272461,
"loss": 0.6855,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": 0.01648247428238392,
"rewards/margins": 0.015830885618925095,
"rewards/rejected": 0.0006515888380818069,
"step": 110
},
{
"epoch": 0.17295739122601567,
"grad_norm": 22.700777053833008,
"learning_rate": 2.884615384615384e-07,
"logits/chosen": -2.339622974395752,
"logits/rejected": -2.326411485671997,
"logps/chosen": -46.59340286254883,
"logps/rejected": -49.68640899658203,
"loss": 0.6852,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.020804349333047867,
"rewards/margins": 0.016823848709464073,
"rewards/rejected": 0.003980500157922506,
"step": 120
},
{
"epoch": 0.18737050716151699,
"grad_norm": 25.500030517578125,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.286569833755493,
"logits/rejected": -2.2731103897094727,
"logps/chosen": -49.55046844482422,
"logps/rejected": -51.0811767578125,
"loss": 0.6805,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": 0.030641257762908936,
"rewards/margins": 0.027367204427719116,
"rewards/rejected": 0.0032740526366978884,
"step": 130
},
{
"epoch": 0.20178362309701828,
"grad_norm": 19.97886848449707,
"learning_rate": 3.3653846153846154e-07,
"logits/chosen": -2.340399980545044,
"logits/rejected": -2.3109829425811768,
"logps/chosen": -44.300235748291016,
"logps/rejected": -46.56055450439453,
"loss": 0.6755,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.032056886702775955,
"rewards/margins": 0.037834975868463516,
"rewards/rejected": -0.005778087303042412,
"step": 140
},
{
"epoch": 0.2161967390325196,
"grad_norm": 19.921979904174805,
"learning_rate": 3.6057692307692306e-07,
"logits/chosen": -2.3215255737304688,
"logits/rejected": -2.297445297241211,
"logps/chosen": -46.028289794921875,
"logps/rejected": -48.2182731628418,
"loss": 0.6787,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.03457826003432274,
"rewards/margins": 0.03234432265162468,
"rewards/rejected": 0.002233942272141576,
"step": 150
},
{
"epoch": 0.2306098549680209,
"grad_norm": 21.054500579833984,
"learning_rate": 3.8461538461538463e-07,
"logits/chosen": -2.2929115295410156,
"logits/rejected": -2.277684450149536,
"logps/chosen": -47.141380310058594,
"logps/rejected": -50.77402114868164,
"loss": 0.6751,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": 0.04410446435213089,
"rewards/margins": 0.04075505584478378,
"rewards/rejected": 0.0033494061790406704,
"step": 160
},
{
"epoch": 0.2450229709035222,
"grad_norm": 20.726028442382812,
"learning_rate": 4.0865384615384614e-07,
"logits/chosen": -2.33288836479187,
"logits/rejected": -2.3155179023742676,
"logps/chosen": -50.497257232666016,
"logps/rejected": -50.81693649291992,
"loss": 0.6756,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.04269781708717346,
"rewards/margins": 0.040725283324718475,
"rewards/rejected": 0.0019725344609469175,
"step": 170
},
{
"epoch": 0.2594360868390235,
"grad_norm": 23.03353500366211,
"learning_rate": 4.326923076923077e-07,
"logits/chosen": -2.286454677581787,
"logits/rejected": -2.268066883087158,
"logps/chosen": -49.3195915222168,
"logps/rejected": -52.67781448364258,
"loss": 0.6692,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.046625155955553055,
"rewards/margins": 0.056254588067531586,
"rewards/rejected": -0.009629428386688232,
"step": 180
},
{
"epoch": 0.2738492027745248,
"grad_norm": 21.245960235595703,
"learning_rate": 4.567307692307692e-07,
"logits/chosen": -2.3012168407440186,
"logits/rejected": -2.287529468536377,
"logps/chosen": -47.20839309692383,
"logps/rejected": -50.67589569091797,
"loss": 0.6676,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": 0.046871501952409744,
"rewards/margins": 0.06035640090703964,
"rewards/rejected": -0.013484900817275047,
"step": 190
},
{
"epoch": 0.2882623187100261,
"grad_norm": 23.244338989257812,
"learning_rate": 4.807692307692307e-07,
"logits/chosen": -2.324427843093872,
"logits/rejected": -2.3083744049072266,
"logps/chosen": -46.15428924560547,
"logps/rejected": -49.030723571777344,
"loss": 0.6683,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": 0.040220100432634354,
"rewards/margins": 0.06216844171285629,
"rewards/rejected": -0.02194834314286709,
"step": 200
},
{
"epoch": 0.30267543464552743,
"grad_norm": 20.938106536865234,
"learning_rate": 4.999985903160127e-07,
"logits/chosen": -2.3429839611053467,
"logits/rejected": -2.3429884910583496,
"logps/chosen": -46.21355438232422,
"logps/rejected": -49.741477966308594,
"loss": 0.6613,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": 0.03884850814938545,
"rewards/margins": 0.08172162622213364,
"rewards/rejected": -0.04287312179803848,
"step": 210
},
{
"epoch": 0.31708855058102875,
"grad_norm": 22.547351837158203,
"learning_rate": 4.999492530456938e-07,
"logits/chosen": -2.2629776000976562,
"logits/rejected": -2.247462034225464,
"logps/chosen": -45.79121780395508,
"logps/rejected": -48.56629943847656,
"loss": 0.6543,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.02658682130277157,
"rewards/margins": 0.10048248618841171,
"rewards/rejected": -0.07389567047357559,
"step": 220
},
{
"epoch": 0.33150166651653007,
"grad_norm": 28.69328498840332,
"learning_rate": 4.998294474728773e-07,
"logits/chosen": -2.3137152194976807,
"logits/rejected": -2.2834296226501465,
"logps/chosen": -53.14280319213867,
"logps/rejected": -54.3192024230957,
"loss": 0.6477,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.03892205283045769,
"rewards/margins": 0.12172921746969223,
"rewards/rejected": -0.08280716836452484,
"step": 230
},
{
"epoch": 0.34591478245203133,
"grad_norm": 23.599994659423828,
"learning_rate": 4.996392073744008e-07,
"logits/chosen": -2.3293533325195312,
"logits/rejected": -2.2977442741394043,
"logps/chosen": -47.376712799072266,
"logps/rejected": -50.33088302612305,
"loss": 0.6433,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": 0.02417893335223198,
"rewards/margins": 0.12420248985290527,
"rewards/rejected": -0.1000235453248024,
"step": 240
},
{
"epoch": 0.36032789838753265,
"grad_norm": 21.964677810668945,
"learning_rate": 4.993785863847387e-07,
"logits/chosen": -2.2910289764404297,
"logits/rejected": -2.266993284225464,
"logps/chosen": -44.49908447265625,
"logps/rejected": -49.51002883911133,
"loss": 0.6259,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.010445142164826393,
"rewards/margins": 0.18903522193431854,
"rewards/rejected": -0.1785901039838791,
"step": 250
},
{
"epoch": 0.37474101432303397,
"grad_norm": 24.81599998474121,
"learning_rate": 4.99047657980881e-07,
"logits/chosen": -2.2835147380828857,
"logits/rejected": -2.2653086185455322,
"logps/chosen": -50.46863555908203,
"logps/rejected": -54.02223587036133,
"loss": 0.6484,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.045146115124225616,
"rewards/margins": 0.15290267765522003,
"rewards/rejected": -0.19804877042770386,
"step": 260
},
{
"epoch": 0.3891541302585353,
"grad_norm": 26.260005950927734,
"learning_rate": 4.986465154616175e-07,
"logits/chosen": -2.2700905799865723,
"logits/rejected": -2.242027759552002,
"logps/chosen": -46.41443634033203,
"logps/rejected": -49.876991271972656,
"loss": 0.6325,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.017268601804971695,
"rewards/margins": 0.1998191624879837,
"rewards/rejected": -0.2170877754688263,
"step": 270
},
{
"epoch": 0.40356724619403656,
"grad_norm": 24.382686614990234,
"learning_rate": 4.981752719212347e-07,
"logits/chosen": -2.2248587608337402,
"logits/rejected": -2.210576295852661,
"logps/chosen": -48.873863220214844,
"logps/rejected": -51.1252555847168,
"loss": 0.6463,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.03884928673505783,
"rewards/margins": 0.15560956299304962,
"rewards/rejected": -0.19445885717868805,
"step": 280
},
{
"epoch": 0.4179803621295379,
"grad_norm": 21.578123092651367,
"learning_rate": 4.976340602176303e-07,
"logits/chosen": -2.2483863830566406,
"logits/rejected": -2.216209650039673,
"logps/chosen": -48.28716278076172,
"logps/rejected": -52.282020568847656,
"loss": 0.6234,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.06292366981506348,
"rewards/margins": 0.23313823342323303,
"rewards/rejected": -0.2960619330406189,
"step": 290
},
{
"epoch": 0.4323934780650392,
"grad_norm": 26.12675666809082,
"learning_rate": 4.970230329348574e-07,
"logits/chosen": -2.246577262878418,
"logits/rejected": -2.2352359294891357,
"logps/chosen": -47.72701644897461,
"logps/rejected": -55.13080596923828,
"loss": 0.6154,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.11627298593521118,
"rewards/margins": 0.2843998968601227,
"rewards/rejected": -0.4006728231906891,
"step": 300
},
{
"epoch": 0.4468065940005405,
"grad_norm": 23.437192916870117,
"learning_rate": 4.963423623401058e-07,
"logits/chosen": -2.2119696140289307,
"logits/rejected": -2.1862404346466064,
"logps/chosen": -47.60432434082031,
"logps/rejected": -51.17278289794922,
"loss": 0.6325,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.10928714275360107,
"rewards/margins": 0.22730882465839386,
"rewards/rejected": -0.33659598231315613,
"step": 310
},
{
"epoch": 0.4612197099360418,
"grad_norm": 25.831727981567383,
"learning_rate": 4.955922403351345e-07,
"logits/chosen": -2.204767942428589,
"logits/rejected": -2.196760892868042,
"logps/chosen": -45.66215896606445,
"logps/rejected": -51.39593505859375,
"loss": 0.6141,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.13322503864765167,
"rewards/margins": 0.2752975821495056,
"rewards/rejected": -0.4085226058959961,
"step": 320
},
{
"epoch": 0.4756328258715431,
"grad_norm": 27.61099624633789,
"learning_rate": 4.947728784021693e-07,
"logits/chosen": -2.219931125640869,
"logits/rejected": -2.2032651901245117,
"logps/chosen": -47.976158142089844,
"logps/rejected": -52.288734436035156,
"loss": 0.6094,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -0.15533845126628876,
"rewards/margins": 0.29920583963394165,
"rewards/rejected": -0.4545443654060364,
"step": 330
},
{
"epoch": 0.4900459418070444,
"grad_norm": 27.884634017944336,
"learning_rate": 4.938845075442793e-07,
"logits/chosen": -2.1479032039642334,
"logits/rejected": -2.1212592124938965,
"logps/chosen": -51.559059143066406,
"logps/rejected": -55.076637268066406,
"loss": 0.6045,
"rewards/accuracies": 0.659375011920929,
"rewards/chosen": -0.16438238322734833,
"rewards/margins": 0.29998037219047546,
"rewards/rejected": -0.4643628001213074,
"step": 340
},
{
"epoch": 0.5044590577425457,
"grad_norm": 21.843393325805664,
"learning_rate": 4.929273782202499e-07,
"logits/chosen": -2.188169002532959,
"logits/rejected": -2.1698548793792725,
"logps/chosen": -49.47840881347656,
"logps/rejected": -55.17237091064453,
"loss": 0.6052,
"rewards/accuracies": 0.653124988079071,
"rewards/chosen": -0.21972529590129852,
"rewards/margins": 0.2996353209018707,
"rewards/rejected": -0.5193605422973633,
"step": 350
},
{
"epoch": 0.518872173678047,
"grad_norm": 28.664966583251953,
"learning_rate": 4.919017602739709e-07,
"logits/chosen": -2.139723539352417,
"logits/rejected": -2.1237754821777344,
"logps/chosen": -48.365455627441406,
"logps/rejected": -54.59684371948242,
"loss": 0.6069,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.29085400700569153,
"rewards/margins": 0.37286603450775146,
"rewards/rejected": -0.6637200117111206,
"step": 360
},
{
"epoch": 0.5332852896135484,
"grad_norm": 34.82406997680664,
"learning_rate": 4.908079428583598e-07,
"logits/chosen": -2.1522464752197266,
"logits/rejected": -2.134714126586914,
"logps/chosen": -51.907684326171875,
"logps/rejected": -55.71944046020508,
"loss": 0.6183,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.3955734372138977,
"rewards/margins": 0.3038731813430786,
"rewards/rejected": -0.6994466185569763,
"step": 370
},
{
"epoch": 0.5476984055490496,
"grad_norm": 31.894775390625,
"learning_rate": 4.8964623435384e-07,
"logits/chosen": -2.1320347785949707,
"logits/rejected": -2.1204841136932373,
"logps/chosen": -50.67914962768555,
"logps/rejected": -56.83320999145508,
"loss": 0.5889,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.33134475350379944,
"rewards/margins": 0.41313639283180237,
"rewards/rejected": -0.7444812059402466,
"step": 380
},
{
"epoch": 0.5621115214845509,
"grad_norm": 22.113506317138672,
"learning_rate": 4.884169622813997e-07,
"logits/chosen": -2.0845208168029785,
"logits/rejected": -2.0749855041503906,
"logps/chosen": -50.413814544677734,
"logps/rejected": -57.11311721801758,
"loss": 0.5937,
"rewards/accuracies": 0.684374988079071,
"rewards/chosen": -0.4426320493221283,
"rewards/margins": 0.4605481028556824,
"rewards/rejected": -0.9031801223754883,
"step": 390
},
{
"epoch": 0.5765246374200522,
"grad_norm": 29.475505828857422,
"learning_rate": 4.87120473210253e-07,
"logits/chosen": -2.1424341201782227,
"logits/rejected": -2.1282877922058105,
"logps/chosen": -52.59001922607422,
"logps/rejected": -59.03765869140625,
"loss": 0.6062,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -0.4012625813484192,
"rewards/margins": 0.43023520708084106,
"rewards/rejected": -0.8314977884292603,
"step": 400
},
{
"epoch": 0.5909377533555535,
"grad_norm": 31.90033721923828,
"learning_rate": 4.857571326601322e-07,
"logits/chosen": -2.121933698654175,
"logits/rejected": -2.108666181564331,
"logps/chosen": -49.52949523925781,
"logps/rejected": -55.079627990722656,
"loss": 0.6174,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.35214418172836304,
"rewards/margins": 0.37995508313179016,
"rewards/rejected": -0.7320992350578308,
"step": 410
},
{
"epoch": 0.6053508692910549,
"grad_norm": 22.285457611083984,
"learning_rate": 4.843273249982365e-07,
"logits/chosen": -2.1331048011779785,
"logits/rejected": -2.1248154640197754,
"logps/chosen": -47.69127655029297,
"logps/rejected": -53.04596710205078,
"loss": 0.5933,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -0.29617828130722046,
"rewards/margins": 0.39620500802993774,
"rewards/rejected": -0.6923832297325134,
"step": 420
},
{
"epoch": 0.6197639852265562,
"grad_norm": 30.609222412109375,
"learning_rate": 4.828314533308668e-07,
"logits/chosen": -2.1201605796813965,
"logits/rejected": -2.1039023399353027,
"logps/chosen": -55.71925735473633,
"logps/rejected": -61.24989700317383,
"loss": 0.5893,
"rewards/accuracies": 0.690625011920929,
"rewards/chosen": -0.3478863537311554,
"rewards/margins": 0.41252002120018005,
"rewards/rejected": -0.7604063749313354,
"step": 430
},
{
"epoch": 0.6341771011620575,
"grad_norm": 34.9681282043457,
"learning_rate": 4.812699393897779e-07,
"logits/chosen": -2.113286256790161,
"logits/rejected": -2.1023306846618652,
"logps/chosen": -50.840431213378906,
"logps/rejected": -56.244529724121094,
"loss": 0.6264,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3603426516056061,
"rewards/margins": 0.34980452060699463,
"rewards/rejected": -0.7101471424102783,
"step": 440
},
{
"epoch": 0.6485902170975588,
"grad_norm": 40.18833541870117,
"learning_rate": 4.796432234132802e-07,
"logits/chosen": -2.14215350151062,
"logits/rejected": -2.132025718688965,
"logps/chosen": -52.24169921875,
"logps/rejected": -57.869903564453125,
"loss": 0.6175,
"rewards/accuracies": 0.6656249761581421,
"rewards/chosen": -0.34826427698135376,
"rewards/margins": 0.3406437039375305,
"rewards/rejected": -0.6889079809188843,
"step": 450
},
{
"epoch": 0.6630033330330601,
"grad_norm": 29.3867244720459,
"learning_rate": 4.77951764022122e-07,
"logits/chosen": -2.0976574420928955,
"logits/rejected": -2.077042579650879,
"logps/chosen": -49.5283203125,
"logps/rejected": -55.05694580078125,
"loss": 0.5964,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -0.3073193430900574,
"rewards/margins": 0.3983311057090759,
"rewards/rejected": -0.7056504487991333,
"step": 460
},
{
"epoch": 0.6774164489685613,
"grad_norm": 26.749961853027344,
"learning_rate": 4.7619603809019113e-07,
"logits/chosen": -2.152034282684326,
"logits/rejected": -2.134824275970459,
"logps/chosen": -54.220680236816406,
"logps/rejected": -59.50426483154297,
"loss": 0.5989,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.3346361815929413,
"rewards/margins": 0.3874064087867737,
"rewards/rejected": -0.7220426201820374,
"step": 470
},
{
"epoch": 0.6918295649040627,
"grad_norm": 33.738075256347656,
"learning_rate": 4.7437654061006917e-07,
"logits/chosen": -2.104640007019043,
"logits/rejected": -2.099822521209717,
"logps/chosen": -52.46540069580078,
"logps/rejected": -59.32879638671875,
"loss": 0.6252,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.4632749557495117,
"rewards/margins": 0.3506197929382324,
"rewards/rejected": -0.8138947486877441,
"step": 480
},
{
"epoch": 0.706242680839564,
"grad_norm": 26.89373016357422,
"learning_rate": 4.7249378455347857e-07,
"logits/chosen": -2.1046807765960693,
"logits/rejected": -2.099517583847046,
"logps/chosen": -51.28865432739258,
"logps/rejected": -59.29227828979492,
"loss": 0.6016,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.3435845673084259,
"rewards/margins": 0.4029726982116699,
"rewards/rejected": -0.7465572357177734,
"step": 490
},
{
"epoch": 0.7206557967750653,
"grad_norm": 33.128177642822266,
"learning_rate": 4.7054830072665973e-07,
"logits/chosen": -2.086678981781006,
"logits/rejected": -2.07316255569458,
"logps/chosen": -52.46559524536133,
"logps/rejected": -57.55824661254883,
"loss": 0.622,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.34209367632865906,
"rewards/margins": 0.35461074113845825,
"rewards/rejected": -0.6967044472694397,
"step": 500
},
{
"epoch": 0.7350689127105666,
"grad_norm": 30.27744483947754,
"learning_rate": 4.6854063762072106e-07,
"logits/chosen": -2.076245069503784,
"logits/rejected": -2.0618348121643066,
"logps/chosen": -48.739891052246094,
"logps/rejected": -54.260215759277344,
"loss": 0.6086,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -0.3690701127052307,
"rewards/margins": 0.3871278464794159,
"rewards/rejected": -0.7561979293823242,
"step": 510
},
{
"epoch": 0.7494820286460679,
"grad_norm": 23.903032302856445,
"learning_rate": 4.664713612570021e-07,
"logits/chosen": -2.0651895999908447,
"logits/rejected": -2.0512535572052,
"logps/chosen": -48.935279846191406,
"logps/rejected": -55.669349670410156,
"loss": 0.5955,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.3753257393836975,
"rewards/margins": 0.43598484992980957,
"rewards/rejected": -0.8113106489181519,
"step": 520
},
{
"epoch": 0.7638951445815693,
"grad_norm": 26.094806671142578,
"learning_rate": 4.6434105502749533e-07,
"logits/chosen": -2.0776233673095703,
"logits/rejected": -2.047234296798706,
"logps/chosen": -48.24291229248047,
"logps/rejected": -54.696556091308594,
"loss": 0.5923,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -0.36777353286743164,
"rewards/margins": 0.4470536708831787,
"rewards/rejected": -0.8148272633552551,
"step": 530
},
{
"epoch": 0.7783082605170706,
"grad_norm": 29.16175079345703,
"learning_rate": 4.621503195303701e-07,
"logits/chosen": -2.1113364696502686,
"logits/rejected": -2.104959726333618,
"logps/chosen": -54.480430603027344,
"logps/rejected": -61.843658447265625,
"loss": 0.6007,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.31826671957969666,
"rewards/margins": 0.41609320044517517,
"rewards/rejected": -0.7343599200248718,
"step": 540
},
{
"epoch": 0.7927213764525718,
"grad_norm": 27.188947677612305,
"learning_rate": 4.598997724006456e-07,
"logits/chosen": -2.115569591522217,
"logits/rejected": -2.097104072570801,
"logps/chosen": -53.17041778564453,
"logps/rejected": -60.1922721862793,
"loss": 0.5679,
"rewards/accuracies": 0.7281249761581421,
"rewards/chosen": -0.2969083786010742,
"rewards/margins": 0.47897768020629883,
"rewards/rejected": -0.7758861184120178,
"step": 550
},
{
"epoch": 0.8071344923880731,
"grad_norm": 28.390932083129883,
"learning_rate": 4.5759004813606083e-07,
"logits/chosen": -2.035731554031372,
"logits/rejected": -2.0177547931671143,
"logps/chosen": -50.56719970703125,
"logps/rejected": -56.478172302246094,
"loss": 0.5872,
"rewards/accuracies": 0.659375011920929,
"rewards/chosen": -0.4207974374294281,
"rewards/margins": 0.46362677216529846,
"rewards/rejected": -0.8844242095947266,
"step": 560
},
{
"epoch": 0.8215476083235744,
"grad_norm": 26.5084285736084,
"learning_rate": 4.5522179791819036e-07,
"logits/chosen": -2.0884745121002197,
"logits/rejected": -2.078564167022705,
"logps/chosen": -49.79915237426758,
"logps/rejected": -55.561363220214844,
"loss": 0.6137,
"rewards/accuracies": 0.659375011920929,
"rewards/chosen": -0.4339517652988434,
"rewards/margins": 0.40265020728111267,
"rewards/rejected": -0.8366019129753113,
"step": 570
},
{
"epoch": 0.8359607242590757,
"grad_norm": 28.170623779296875,
"learning_rate": 4.527956894288564e-07,
"logits/chosen": -2.0642967224121094,
"logits/rejected": -2.0509092807769775,
"logps/chosen": -53.66132354736328,
"logps/rejected": -58.7640266418457,
"loss": 0.6042,
"rewards/accuracies": 0.653124988079071,
"rewards/chosen": -0.3676076829433441,
"rewards/margins": 0.39320507645606995,
"rewards/rejected": -0.7608126401901245,
"step": 580
},
{
"epoch": 0.8503738401945771,
"grad_norm": 39.17332077026367,
"learning_rate": 4.503124066618891e-07,
"logits/chosen": -2.1024057865142822,
"logits/rejected": -2.072375535964966,
"logps/chosen": -52.28911590576172,
"logps/rejected": -56.1530647277832,
"loss": 0.6153,
"rewards/accuracies": 0.6343749761581421,
"rewards/chosen": -0.4251914918422699,
"rewards/margins": 0.3595213294029236,
"rewards/rejected": -0.7847127914428711,
"step": 590
},
{
"epoch": 0.8647869561300784,
"grad_norm": 30.457704544067383,
"learning_rate": 4.4777264973028763e-07,
"logits/chosen": -2.0802459716796875,
"logits/rejected": -2.054129123687744,
"logps/chosen": -55.20709228515625,
"logps/rejected": -60.87419891357422,
"loss": 0.5882,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.45640072226524353,
"rewards/margins": 0.43457716703414917,
"rewards/rejected": -0.8909778594970703,
"step": 600
},
{
"epoch": 0.8792000720655797,
"grad_norm": 24.48934555053711,
"learning_rate": 4.4517713466883733e-07,
"logits/chosen": -2.0442166328430176,
"logits/rejected": -2.0350658893585205,
"logps/chosen": -49.768959045410156,
"logps/rejected": -56.72099685668945,
"loss": 0.6119,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.44116124510765076,
"rewards/margins": 0.39726558327674866,
"rewards/rejected": -0.8384267687797546,
"step": 610
},
{
"epoch": 0.893613188001081,
"grad_norm": 31.750471115112305,
"learning_rate": 4.425265932322374e-07,
"logits/chosen": -2.050736904144287,
"logits/rejected": -2.0227198600769043,
"logps/chosen": -50.26643753051758,
"logps/rejected": -57.786712646484375,
"loss": 0.5635,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -0.40668320655822754,
"rewards/margins": 0.5360392332077026,
"rewards/rejected": -0.9427223205566406,
"step": 620
},
{
"epoch": 0.9080263039365822,
"grad_norm": 26.52891731262207,
"learning_rate": 4.3982177268879713e-07,
"logits/chosen": -2.0627334117889404,
"logits/rejected": -2.0695691108703613,
"logps/chosen": -53.98938751220703,
"logps/rejected": -62.720008850097656,
"loss": 0.6038,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -0.5125323534011841,
"rewards/margins": 0.3868643641471863,
"rewards/rejected": -0.8993967771530151,
"step": 630
},
{
"epoch": 0.9224394198720836,
"grad_norm": 33.650299072265625,
"learning_rate": 4.370634356097582e-07,
"logits/chosen": -2.04744553565979,
"logits/rejected": -2.026310443878174,
"logps/chosen": -51.46614456176758,
"logps/rejected": -59.32075881958008,
"loss": 0.5673,
"rewards/accuracies": 0.690625011920929,
"rewards/chosen": -0.5189432501792908,
"rewards/margins": 0.5335994958877563,
"rewards/rejected": -1.0525426864624023,
"step": 640
},
{
"epoch": 0.9368525358075849,
"grad_norm": 27.66231346130371,
"learning_rate": 4.3425235965430267e-07,
"logits/chosen": -2.044187545776367,
"logits/rejected": -2.0403525829315186,
"logps/chosen": -53.0405158996582,
"logps/rejected": -59.9106330871582,
"loss": 0.5888,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6265466213226318,
"rewards/margins": 0.45962271094322205,
"rewards/rejected": -1.0861692428588867,
"step": 650
},
{
"epoch": 0.9512656517430862,
"grad_norm": 20.190792083740234,
"learning_rate": 4.3138933735030723e-07,
"logits/chosen": -2.0359790325164795,
"logits/rejected": -2.0277514457702637,
"logps/chosen": -50.206233978271484,
"logps/rejected": -58.02119827270508,
"loss": 0.5932,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5592349767684937,
"rewards/margins": 0.50580894947052,
"rewards/rejected": -1.0650438070297241,
"step": 660
},
{
"epoch": 0.9656787676785875,
"grad_norm": 27.47838020324707,
"learning_rate": 4.284751758709052e-07,
"logits/chosen": -2.0135226249694824,
"logits/rejected": -1.993334174156189,
"logps/chosen": -55.60878372192383,
"logps/rejected": -62.4648323059082,
"loss": 0.6215,
"rewards/accuracies": 0.690625011920929,
"rewards/chosen": -0.6603598594665527,
"rewards/margins": 0.49340876936912537,
"rewards/rejected": -1.153768539428711,
"step": 670
},
{
"epoch": 0.9800918836140888,
"grad_norm": 27.680631637573242,
"learning_rate": 4.255106968069201e-07,
"logits/chosen": -1.9804503917694092,
"logits/rejected": -1.9623844623565674,
"logps/chosen": -53.62810134887695,
"logps/rejected": -59.58030319213867,
"loss": 0.5732,
"rewards/accuracies": 0.659375011920929,
"rewards/chosen": -0.5149959325790405,
"rewards/margins": 0.5002261400222778,
"rewards/rejected": -1.0152220726013184,
"step": 680
},
{
"epoch": 0.9945049995495902,
"grad_norm": 27.70201873779297,
"learning_rate": 4.2249673593523427e-07,
"logits/chosen": -2.009028673171997,
"logits/rejected": -1.9938583374023438,
"logps/chosen": -51.165802001953125,
"logps/rejected": -57.561912536621094,
"loss": 0.6249,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.5677297711372375,
"rewards/margins": 0.3783959150314331,
"rewards/rejected": -0.9461256265640259,
"step": 690
},
{
"epoch": 1.0089181154850915,
"grad_norm": 22.3559627532959,
"learning_rate": 4.194341429831576e-07,
"logits/chosen": -1.9959580898284912,
"logits/rejected": -1.9894912242889404,
"logps/chosen": -50.8765754699707,
"logps/rejected": -58.38984298706055,
"loss": 0.5678,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.47180747985839844,
"rewards/margins": 0.547687828540802,
"rewards/rejected": -1.0194952487945557,
"step": 700
},
{
"epoch": 1.0233312314205927,
"grad_norm": 26.511709213256836,
"learning_rate": 4.163237813888639e-07,
"logits/chosen": -1.9872064590454102,
"logits/rejected": -1.9805561304092407,
"logps/chosen": -51.93730545043945,
"logps/rejected": -61.05963134765625,
"loss": 0.5136,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.45482367277145386,
"rewards/margins": 0.6706832647323608,
"rewards/rejected": -1.1255069971084595,
"step": 710
},
{
"epoch": 1.037744347356094,
"grad_norm": 23.604045867919922,
"learning_rate": 4.1316652805796103e-07,
"logits/chosen": -2.0082569122314453,
"logits/rejected": -1.9880993366241455,
"logps/chosen": -53.464210510253906,
"logps/rejected": -60.89324951171875,
"loss": 0.5112,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.5722960233688354,
"rewards/margins": 0.7038034796714783,
"rewards/rejected": -1.276099443435669,
"step": 720
},
{
"epoch": 1.0521574632915953,
"grad_norm": 27.071109771728516,
"learning_rate": 4.09963273116265e-07,
"logits/chosen": -1.957069993019104,
"logits/rejected": -1.946636438369751,
"logps/chosen": -51.59601974487305,
"logps/rejected": -63.518707275390625,
"loss": 0.5106,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.6144558787345886,
"rewards/margins": 0.7481251358985901,
"rewards/rejected": -1.3625810146331787,
"step": 730
},
{
"epoch": 1.0665705792270967,
"grad_norm": 25.10072135925293,
"learning_rate": 4.0671491965884575e-07,
"logits/chosen": -1.903646469116211,
"logits/rejected": -1.879151701927185,
"logps/chosen": -50.908939361572266,
"logps/rejected": -60.83250045776367,
"loss": 0.4971,
"rewards/accuracies": 0.778124988079071,
"rewards/chosen": -0.5956512689590454,
"rewards/margins": 0.7306423783302307,
"rewards/rejected": -1.326293706893921,
"step": 740
},
{
"epoch": 1.080983695162598,
"grad_norm": 27.48199462890625,
"learning_rate": 4.034223834954178e-07,
"logits/chosen": -1.883504867553711,
"logits/rejected": -1.8606479167938232,
"logps/chosen": -56.974342346191406,
"logps/rejected": -66.8110580444336,
"loss": 0.4872,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -0.7325400114059448,
"rewards/margins": 0.8243793249130249,
"rewards/rejected": -1.5569193363189697,
"step": 750
},
{
"epoch": 1.0953968110980994,
"grad_norm": 37.682456970214844,
"learning_rate": 4.000865928921453e-07,
"logits/chosen": -1.8376388549804688,
"logits/rejected": -1.8330386877059937,
"logps/chosen": -57.87821578979492,
"logps/rejected": -65.87061309814453,
"loss": 0.5093,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.9233464002609253,
"rewards/margins": 0.7766343355178833,
"rewards/rejected": -1.6999807357788086,
"step": 760
},
{
"epoch": 1.1098099270336006,
"grad_norm": 29.70059585571289,
"learning_rate": 3.967084883099356e-07,
"logits/chosen": -1.843123435974121,
"logits/rejected": -1.8221601247787476,
"logps/chosen": -56.51304244995117,
"logps/rejected": -67.86689758300781,
"loss": 0.5015,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9158357381820679,
"rewards/margins": 0.8356377482414246,
"rewards/rejected": -1.7514736652374268,
"step": 770
},
{
"epoch": 1.1242230429691018,
"grad_norm": 35.48520278930664,
"learning_rate": 3.932890221392945e-07,
"logits/chosen": -1.8560593128204346,
"logits/rejected": -1.8394947052001953,
"logps/chosen": -58.27685546875,
"logps/rejected": -69.38211822509766,
"loss": 0.4989,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.931163489818573,
"rewards/margins": 0.8966981768608093,
"rewards/rejected": -1.8278617858886719,
"step": 780
},
{
"epoch": 1.1386361589046032,
"grad_norm": 29.43699073791504,
"learning_rate": 3.8982915843181873e-07,
"logits/chosen": -1.8902702331542969,
"logits/rejected": -1.8777059316635132,
"logps/chosen": -57.33638381958008,
"logps/rejected": -68.3757095336914,
"loss": 0.4924,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8548051714897156,
"rewards/margins": 0.8410500288009644,
"rewards/rejected": -1.6958551406860352,
"step": 790
},
{
"epoch": 1.1530492748401044,
"grad_norm": 34.55091094970703,
"learning_rate": 3.8632987262840035e-07,
"logits/chosen": -1.842508316040039,
"logits/rejected": -1.8238246440887451,
"logps/chosen": -56.97047805786133,
"logps/rejected": -67.20683288574219,
"loss": 0.5201,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9612547755241394,
"rewards/margins": 0.7814940214157104,
"rewards/rejected": -1.7427488565444946,
"step": 800
},
{
"epoch": 1.1674623907756059,
"grad_norm": 33.40706253051758,
"learning_rate": 3.8279215128422e-07,
"logits/chosen": -1.911771535873413,
"logits/rejected": -1.9070079326629639,
"logps/chosen": -57.28943634033203,
"logps/rejected": -66.23017883300781,
"loss": 0.528,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8022342920303345,
"rewards/margins": 0.6388766765594482,
"rewards/rejected": -1.4411109685897827,
"step": 810
},
{
"epoch": 1.181875506711107,
"grad_norm": 26.427078247070312,
"learning_rate": 3.792169917906075e-07,
"logits/chosen": -1.8850971460342407,
"logits/rejected": -1.872187852859497,
"logps/chosen": -56.62986373901367,
"logps/rejected": -63.857086181640625,
"loss": 0.499,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.7205672264099121,
"rewards/margins": 0.7504978179931641,
"rewards/rejected": -1.4710649251937866,
"step": 820
},
{
"epoch": 1.1962886226466085,
"grad_norm": 36.381370544433594,
"learning_rate": 3.7560540209384623e-07,
"logits/chosen": -1.8158347606658936,
"logits/rejected": -1.8129236698150635,
"logps/chosen": -51.83489227294922,
"logps/rejected": -60.24357986450195,
"loss": 0.5324,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -0.8396095037460327,
"rewards/margins": 0.7031906843185425,
"rewards/rejected": -1.5428001880645752,
"step": 830
},
{
"epoch": 1.2107017385821097,
"grad_norm": 19.65403938293457,
"learning_rate": 3.719584004110028e-07,
"logits/chosen": -1.8365529775619507,
"logits/rejected": -1.8232309818267822,
"logps/chosen": -55.054298400878906,
"logps/rejected": -66.90823364257812,
"loss": 0.4757,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.955776572227478,
"rewards/margins": 0.8788663744926453,
"rewards/rejected": -1.8346431255340576,
"step": 840
},
{
"epoch": 1.225114854517611,
"grad_norm": 23.882118225097656,
"learning_rate": 3.6827701494286073e-07,
"logits/chosen": -1.7997219562530518,
"logits/rejected": -1.7894538640975952,
"logps/chosen": -61.450218200683594,
"logps/rejected": -72.72858428955078,
"loss": 0.491,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -1.0577744245529175,
"rewards/margins": 0.9473785161972046,
"rewards/rejected": -2.005152940750122,
"step": 850
},
{
"epoch": 1.2395279704531124,
"grad_norm": 36.03754806518555,
"learning_rate": 3.6456228358403906e-07,
"logits/chosen": -1.7837250232696533,
"logits/rejected": -1.7650636434555054,
"logps/chosen": -52.76055908203125,
"logps/rejected": -64.60010528564453,
"loss": 0.4848,
"rewards/accuracies": 0.778124988079071,
"rewards/chosen": -0.9400711059570312,
"rewards/margins": 0.9297584295272827,
"rewards/rejected": -1.869829535484314,
"step": 860
},
{
"epoch": 1.2539410863886136,
"grad_norm": 26.958053588867188,
"learning_rate": 3.608152536303784e-07,
"logits/chosen": -1.8296825885772705,
"logits/rejected": -1.8235883712768555,
"logps/chosen": -57.7917594909668,
"logps/rejected": -69.37285614013672,
"loss": 0.4746,
"rewards/accuracies": 0.7718750238418579,
"rewards/chosen": -0.9810036420822144,
"rewards/margins": 0.9666363000869751,
"rewards/rejected": -1.9476398229599,
"step": 870
},
{
"epoch": 1.268354202324115,
"grad_norm": 41.44581985473633,
"learning_rate": 3.570369814836765e-07,
"logits/chosen": -1.8446134328842163,
"logits/rejected": -1.8234678506851196,
"logps/chosen": -62.26701736450195,
"logps/rejected": -72.74183654785156,
"loss": 0.4605,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.065502643585205,
"rewards/margins": 1.0432883501052856,
"rewards/rejected": -2.1087911128997803,
"step": 880
},
{
"epoch": 1.2827673182596162,
"grad_norm": 33.43363952636719,
"learning_rate": 3.532285323538562e-07,
"logits/chosen": -1.8326313495635986,
"logits/rejected": -1.8126726150512695,
"logps/chosen": -58.24235916137695,
"logps/rejected": -67.00035858154297,
"loss": 0.519,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.9739583134651184,
"rewards/margins": 0.7338631749153137,
"rewards/rejected": -1.7078216075897217,
"step": 890
},
{
"epoch": 1.2971804341951176,
"grad_norm": 28.870502471923828,
"learning_rate": 3.493909799586503e-07,
"logits/chosen": -1.830145239830017,
"logits/rejected": -1.8221817016601562,
"logps/chosen": -55.99808883666992,
"logps/rejected": -66.1963882446289,
"loss": 0.5362,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.8822668790817261,
"rewards/margins": 0.6761519908905029,
"rewards/rejected": -1.5584189891815186,
"step": 900
},
{
"epoch": 1.3115935501306188,
"grad_norm": 37.35979461669922,
"learning_rate": 3.4552540622088826e-07,
"logits/chosen": -1.7925065755844116,
"logits/rejected": -1.7871220111846924,
"logps/chosen": -52.560211181640625,
"logps/rejected": -63.6556396484375,
"loss": 0.5004,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8926779627799988,
"rewards/margins": 0.8026574850082397,
"rewards/rejected": -1.6953353881835938,
"step": 910
},
{
"epoch": 1.32600666606612,
"grad_norm": 25.262666702270508,
"learning_rate": 3.416329009634687e-07,
"logits/chosen": -1.76953125,
"logits/rejected": -1.7501062154769897,
"logps/chosen": -55.17573928833008,
"logps/rejected": -65.00733184814453,
"loss": 0.4877,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9110902547836304,
"rewards/margins": 0.8462222218513489,
"rewards/rejected": -1.757312536239624,
"step": 920
},
{
"epoch": 1.3404197820016215,
"grad_norm": 32.45104217529297,
"learning_rate": 3.377145616021055e-07,
"logits/chosen": -1.8046982288360596,
"logits/rejected": -1.7909055948257446,
"logps/chosen": -57.70347213745117,
"logps/rejected": -69.15714263916016,
"loss": 0.4964,
"rewards/accuracies": 0.7593749761581421,
"rewards/chosen": -1.0629938840866089,
"rewards/margins": 0.8470731973648071,
"rewards/rejected": -1.9100669622421265,
"step": 930
},
{
"epoch": 1.354832897937123,
"grad_norm": 39.708580017089844,
"learning_rate": 3.337714928359326e-07,
"logits/chosen": -1.7561019659042358,
"logits/rejected": -1.7309824228286743,
"logps/chosen": -56.85551834106445,
"logps/rejected": -67.09632873535156,
"loss": 0.4932,
"rewards/accuracies": 0.7593749761581421,
"rewards/chosen": -1.0887633562088013,
"rewards/margins": 0.8313824534416199,
"rewards/rejected": -1.9201457500457764,
"step": 940
},
{
"epoch": 1.3692460138726241,
"grad_norm": 27.966632843017578,
"learning_rate": 3.2980480633605616e-07,
"logits/chosen": -1.7208874225616455,
"logits/rejected": -1.7174994945526123,
"logps/chosen": -57.519561767578125,
"logps/rejected": -69.14530944824219,
"loss": 0.4866,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1282824277877808,
"rewards/margins": 0.8654249906539917,
"rewards/rejected": -1.993707299232483,
"step": 950
},
{
"epoch": 1.3836591298081253,
"grad_norm": 26.737092971801758,
"learning_rate": 3.2581562043214015e-07,
"logits/chosen": -1.7533900737762451,
"logits/rejected": -1.7562023401260376,
"logps/chosen": -54.5827751159668,
"logps/rejected": -67.70366668701172,
"loss": 0.4853,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.157460331916809,
"rewards/margins": 0.9240506887435913,
"rewards/rejected": -2.0815110206604004,
"step": 960
},
{
"epoch": 1.3980722457436268,
"grad_norm": 28.375288009643555,
"learning_rate": 3.2180505979711557e-07,
"logits/chosen": -1.7657930850982666,
"logits/rejected": -1.7565358877182007,
"logps/chosen": -59.47906494140625,
"logps/rejected": -68.41764831542969,
"loss": 0.5293,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1187207698822021,
"rewards/margins": 0.7722535729408264,
"rewards/rejected": -1.8909746408462524,
"step": 970
},
{
"epoch": 1.412485361679128,
"grad_norm": 34.44431686401367,
"learning_rate": 3.1777425513010055e-07,
"logits/chosen": -1.7356555461883545,
"logits/rejected": -1.7229560613632202,
"logps/chosen": -57.01166915893555,
"logps/rejected": -69.17829895019531,
"loss": 0.4943,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -1.2534024715423584,
"rewards/margins": 0.9338359832763672,
"rewards/rejected": -2.1872386932373047,
"step": 980
},
{
"epoch": 1.4268984776146292,
"grad_norm": 22.93629264831543,
"learning_rate": 3.1372434283762205e-07,
"logits/chosen": -1.7858479022979736,
"logits/rejected": -1.777570366859436,
"logps/chosen": -55.632896423339844,
"logps/rejected": -67.86164855957031,
"loss": 0.5142,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1818989515304565,
"rewards/margins": 0.8865016102790833,
"rewards/rejected": -2.0684006214141846,
"step": 990
},
{
"epoch": 1.4413115935501306,
"grad_norm": 27.421680450439453,
"learning_rate": 3.0965646471322844e-07,
"logits/chosen": -1.797176718711853,
"logits/rejected": -1.782179594039917,
"logps/chosen": -53.52863311767578,
"logps/rejected": -65.38296508789062,
"loss": 0.4965,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0196263790130615,
"rewards/margins": 0.8526620864868164,
"rewards/rejected": -1.8722883462905884,
"step": 1000
},
{
"epoch": 1.455724709485632,
"grad_norm": 28.67449951171875,
"learning_rate": 3.055717676155827e-07,
"logits/chosen": -1.7956054210662842,
"logits/rejected": -1.782934546470642,
"logps/chosen": -54.95515060424805,
"logps/rejected": -64.83052825927734,
"loss": 0.52,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9566730260848999,
"rewards/margins": 0.7631611227989197,
"rewards/rejected": -1.7198339700698853,
"step": 1010
},
{
"epoch": 1.4701378254211332,
"grad_norm": 29.06233024597168,
"learning_rate": 3.0147140314512853e-07,
"logits/chosen": -1.8102481365203857,
"logits/rejected": -1.806222915649414,
"logps/chosen": -58.04301071166992,
"logps/rejected": -70.16658020019531,
"loss": 0.4972,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -0.9563525915145874,
"rewards/margins": 0.8730182647705078,
"rewards/rejected": -1.8293708562850952,
"step": 1020
},
{
"epoch": 1.4845509413566345,
"grad_norm": 19.513904571533203,
"learning_rate": 2.973565273194188e-07,
"logits/chosen": -1.7972570657730103,
"logits/rejected": -1.7693393230438232,
"logps/chosen": -58.28984451293945,
"logps/rejected": -66.8588638305664,
"loss": 0.4968,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.035782814025879,
"rewards/margins": 0.8242384195327759,
"rewards/rejected": -1.8600209951400757,
"step": 1030
},
{
"epoch": 1.4989640572921359,
"grad_norm": 29.952312469482422,
"learning_rate": 2.932283002471991e-07,
"logits/chosen": -1.8092374801635742,
"logits/rejected": -1.7851841449737549,
"logps/chosen": -60.324623107910156,
"logps/rejected": -69.95586395263672,
"loss": 0.5181,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.007617712020874,
"rewards/margins": 0.7622562646865845,
"rewards/rejected": -1.7698739767074585,
"step": 1040
},
{
"epoch": 1.513377173227637,
"grad_norm": 25.340177536010742,
"learning_rate": 2.89087885801338e-07,
"logits/chosen": -1.8064777851104736,
"logits/rejected": -1.806052565574646,
"logps/chosen": -51.601837158203125,
"logps/rejected": -61.78096389770508,
"loss": 0.4989,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.838117778301239,
"rewards/margins": 0.8140629529953003,
"rewards/rejected": -1.6521809101104736,
"step": 1050
},
{
"epoch": 1.5277902891631383,
"grad_norm": 37.2698860168457,
"learning_rate": 2.8493645129069535e-07,
"logits/chosen": -1.7569599151611328,
"logits/rejected": -1.730499029159546,
"logps/chosen": -56.78059005737305,
"logps/rejected": -66.43777465820312,
"loss": 0.4999,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8391516804695129,
"rewards/margins": 0.7788572311401367,
"rewards/rejected": -1.6180089712142944,
"step": 1060
},
{
"epoch": 1.5422034050986397,
"grad_norm": 31.261423110961914,
"learning_rate": 2.807751671310231e-07,
"logits/chosen": -1.7782785892486572,
"logits/rejected": -1.7672075033187866,
"logps/chosen": -61.70476150512695,
"logps/rejected": -71.72880554199219,
"loss": 0.4953,
"rewards/accuracies": 0.7718750238418579,
"rewards/chosen": -0.9828069806098938,
"rewards/margins": 0.8402408361434937,
"rewards/rejected": -1.8230478763580322,
"step": 1070
},
{
"epoch": 1.5566165210341412,
"grad_norm": 27.57630157470703,
"learning_rate": 2.7660520651498853e-07,
"logits/chosen": -1.778074026107788,
"logits/rejected": -1.7654485702514648,
"logps/chosen": -61.545814514160156,
"logps/rejected": -71.43892669677734,
"loss": 0.4792,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.9759114384651184,
"rewards/margins": 0.8692989349365234,
"rewards/rejected": -1.845210313796997,
"step": 1080
},
{
"epoch": 1.5710296369696424,
"grad_norm": 35.753257751464844,
"learning_rate": 2.7242774508141663e-07,
"logits/chosen": -1.7737147808074951,
"logits/rejected": -1.7685718536376953,
"logps/chosen": -58.334930419921875,
"logps/rejected": -69.20314025878906,
"loss": 0.5223,
"rewards/accuracies": 0.715624988079071,
"rewards/chosen": -0.9689555168151855,
"rewards/margins": 0.7592560648918152,
"rewards/rejected": -1.7282116413116455,
"step": 1090
},
{
"epoch": 1.5854427529051436,
"grad_norm": 29.153667449951172,
"learning_rate": 2.682439605838408e-07,
"logits/chosen": -1.8176618814468384,
"logits/rejected": -1.812830924987793,
"logps/chosen": -57.24834442138672,
"logps/rejected": -68.6217041015625,
"loss": 0.5081,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.8450204730033875,
"rewards/margins": 0.7896521687507629,
"rewards/rejected": -1.6346725225448608,
"step": 1100
},
{
"epoch": 1.599855868840645,
"grad_norm": 31.53214454650879,
"learning_rate": 2.6405503255845875e-07,
"logits/chosen": -1.82248055934906,
"logits/rejected": -1.814016580581665,
"logps/chosen": -59.1510124206543,
"logps/rejected": -70.27528381347656,
"loss": 0.4777,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.7771763801574707,
"rewards/margins": 0.8746312856674194,
"rewards/rejected": -1.6518075466156006,
"step": 1110
},
{
"epoch": 1.6142689847761464,
"grad_norm": 32.77677536010742,
"learning_rate": 2.598621419915853e-07,
"logits/chosen": -1.757372260093689,
"logits/rejected": -1.7502870559692383,
"logps/chosen": -53.64165496826172,
"logps/rejected": -66.46639251708984,
"loss": 0.4758,
"rewards/accuracies": 0.7718750238418579,
"rewards/chosen": -0.850943922996521,
"rewards/margins": 0.9450072050094604,
"rewards/rejected": -1.795951247215271,
"step": 1120
},
{
"epoch": 1.6286821007116477,
"grad_norm": 45.913150787353516,
"learning_rate": 2.5566647098669636e-07,
"logits/chosen": -1.8059905767440796,
"logits/rejected": -1.7845344543457031,
"logps/chosen": -55.63075637817383,
"logps/rejected": -65.79679870605469,
"loss": 0.5267,
"rewards/accuracies": 0.746874988079071,
"rewards/chosen": -0.853289008140564,
"rewards/margins": 0.7583447098731995,
"rewards/rejected": -1.6116336584091187,
"step": 1130
},
{
"epoch": 1.6430952166471489,
"grad_norm": 32.05988311767578,
"learning_rate": 2.5146920243115764e-07,
"logits/chosen": -1.7860431671142578,
"logits/rejected": -1.7688003778457642,
"logps/chosen": -56.956817626953125,
"logps/rejected": -65.7645263671875,
"loss": 0.5318,
"rewards/accuracies": 0.721875011920929,
"rewards/chosen": -0.8115145564079285,
"rewards/margins": 0.6689954996109009,
"rewards/rejected": -1.4805099964141846,
"step": 1140
},
{
"epoch": 1.6575083325826503,
"grad_norm": 25.664705276489258,
"learning_rate": 2.4727151966273337e-07,
"logits/chosen": -1.7770273685455322,
"logits/rejected": -1.7563207149505615,
"logps/chosen": -55.638404846191406,
"logps/rejected": -62.886566162109375,
"loss": 0.5011,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.8054786920547485,
"rewards/margins": 0.7769336104393005,
"rewards/rejected": -1.5824123620986938,
"step": 1150
},
{
"epoch": 1.6719214485181515,
"grad_norm": 28.833738327026367,
"learning_rate": 2.4307460613596694e-07,
"logits/chosen": -1.8111257553100586,
"logits/rejected": -1.7967822551727295,
"logps/chosen": -57.33094024658203,
"logps/rejected": -67.30760192871094,
"loss": 0.4935,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.799383819103241,
"rewards/margins": 0.8343574404716492,
"rewards/rejected": -1.6337411403656006,
"step": 1160
},
{
"epoch": 1.6863345644536527,
"grad_norm": 21.161962509155273,
"learning_rate": 2.388796450885288e-07,
"logits/chosen": -1.8059934377670288,
"logits/rejected": -1.798081398010254,
"logps/chosen": -50.3988151550293,
"logps/rejected": -62.55088424682617,
"loss": 0.465,
"rewards/accuracies": 0.784375011920929,
"rewards/chosen": -0.8187153935432434,
"rewards/margins": 0.9717043042182922,
"rewards/rejected": -1.7904198169708252,
"step": 1170
},
{
"epoch": 1.7007476803891541,
"grad_norm": 33.9837760925293,
"learning_rate": 2.3468781920762646e-07,
"logits/chosen": -1.7467705011367798,
"logits/rejected": -1.7256418466567993,
"logps/chosen": -58.491493225097656,
"logps/rejected": -70.8143310546875,
"loss": 0.4539,
"rewards/accuracies": 0.778124988079071,
"rewards/chosen": -0.9473183751106262,
"rewards/margins": 1.0221917629241943,
"rewards/rejected": -1.9695100784301758,
"step": 1180
},
{
"epoch": 1.7151607963246556,
"grad_norm": 25.674482345581055,
"learning_rate": 2.3050031029656825e-07,
"logits/chosen": -1.800931692123413,
"logits/rejected": -1.7879711389541626,
"logps/chosen": -60.068443298339844,
"logps/rejected": -69.20748138427734,
"loss": 0.493,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.9745752215385437,
"rewards/margins": 0.8433617353439331,
"rewards/rejected": -1.817936897277832,
"step": 1190
},
{
"epoch": 1.7295739122601568,
"grad_norm": 32.89895248413086,
"learning_rate": 2.2631829894157754e-07,
"logits/chosen": -1.76886785030365,
"logits/rejected": -1.754063606262207,
"logps/chosen": -58.20952224731445,
"logps/rejected": -69.31002044677734,
"loss": 0.4935,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -1.061232328414917,
"rewards/margins": 0.8504399061203003,
"rewards/rejected": -1.9116723537445068,
"step": 1200
},
{
"epoch": 1.743987028195658,
"grad_norm": 30.052446365356445,
"learning_rate": 2.2214296417894906e-07,
"logits/chosen": -1.7634525299072266,
"logits/rejected": -1.7487919330596924,
"logps/chosen": -55.20353317260742,
"logps/rejected": -65.99549865722656,
"loss": 0.4979,
"rewards/accuracies": 0.746874988079071,
"rewards/chosen": -0.9935828447341919,
"rewards/margins": 0.8899686932563782,
"rewards/rejected": -1.8835513591766357,
"step": 1210
},
{
"epoch": 1.7584001441311594,
"grad_norm": 31.799606323242188,
"learning_rate": 2.1797548316264319e-07,
"logits/chosen": -1.7502672672271729,
"logits/rejected": -1.740473747253418,
"logps/chosen": -56.68674850463867,
"logps/rejected": -67.84220123291016,
"loss": 0.5064,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.0200823545455933,
"rewards/margins": 0.803787887096405,
"rewards/rejected": -1.823870062828064,
"step": 1220
},
{
"epoch": 1.7728132600666606,
"grad_norm": 32.939762115478516,
"learning_rate": 2.1381703083240987e-07,
"logits/chosen": -1.810317039489746,
"logits/rejected": -1.7904523611068726,
"logps/chosen": -56.20463943481445,
"logps/rejected": -65.8335189819336,
"loss": 0.4865,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.9756546020507812,
"rewards/margins": 0.9413552284240723,
"rewards/rejected": -1.917009711265564,
"step": 1230
},
{
"epoch": 1.7872263760021618,
"grad_norm": 26.08432388305664,
"learning_rate": 2.0966877958253787e-07,
"logits/chosen": -1.8199723958969116,
"logits/rejected": -1.801325798034668,
"logps/chosen": -61.7148323059082,
"logps/rejected": -71.31344604492188,
"loss": 0.4977,
"rewards/accuracies": 0.753125011920929,
"rewards/chosen": -1.037131667137146,
"rewards/margins": 0.8753975033760071,
"rewards/rejected": -1.9125293493270874,
"step": 1240
},
{
"epoch": 1.8016394919376633,
"grad_norm": 32.24626922607422,
"learning_rate": 2.0553189893132076e-07,
"logits/chosen": -1.7616393566131592,
"logits/rejected": -1.7485902309417725,
"logps/chosen": -53.9367561340332,
"logps/rejected": -66.37442016601562,
"loss": 0.4515,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9454119801521301,
"rewards/margins": 0.9221888780593872,
"rewards/rejected": -1.8676010370254517,
"step": 1250
},
{
"epoch": 1.8160526078731647,
"grad_norm": 23.929574966430664,
"learning_rate": 2.0140755519133434e-07,
"logits/chosen": -1.71337890625,
"logits/rejected": -1.7015049457550049,
"logps/chosen": -59.80029296875,
"logps/rejected": -70.41039276123047,
"loss": 0.4801,
"rewards/accuracies": 0.7593749761581421,
"rewards/chosen": -1.0442065000534058,
"rewards/margins": 0.8991094827651978,
"rewards/rejected": -1.943315863609314,
"step": 1260
},
{
"epoch": 1.830465723808666,
"grad_norm": 25.99103355407715,
"learning_rate": 1.9729691114061736e-07,
"logits/chosen": -1.7196881771087646,
"logits/rejected": -1.7151161432266235,
"logps/chosen": -53.206695556640625,
"logps/rejected": -67.19075012207031,
"loss": 0.4637,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.0664688348770142,
"rewards/margins": 1.1020677089691162,
"rewards/rejected": -2.16853666305542,
"step": 1270
},
{
"epoch": 1.844878839744167,
"grad_norm": 32.41804122924805,
"learning_rate": 1.9320112569484946e-07,
"logits/chosen": -1.7552703619003296,
"logits/rejected": -1.7278478145599365,
"logps/chosen": -55.47632598876953,
"logps/rejected": -64.41392517089844,
"loss": 0.5581,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.140154480934143,
"rewards/margins": 0.7451371550559998,
"rewards/rejected": -1.8852916955947876,
"step": 1280
},
{
"epoch": 1.8592919556796685,
"grad_norm": 32.51744079589844,
"learning_rate": 1.8912135358061694e-07,
"logits/chosen": -1.796501874923706,
"logits/rejected": -1.779625654220581,
"logps/chosen": -58.72721481323242,
"logps/rejected": -69.07386779785156,
"loss": 0.4605,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0752990245819092,
"rewards/margins": 0.8940299153327942,
"rewards/rejected": -1.9693288803100586,
"step": 1290
},
{
"epoch": 1.87370507161517,
"grad_norm": 29.42111587524414,
"learning_rate": 1.8505874500986088e-07,
"logits/chosen": -1.78672194480896,
"logits/rejected": -1.7740051746368408,
"logps/chosen": -53.470664978027344,
"logps/rejected": -64.02452087402344,
"loss": 0.4957,
"rewards/accuracies": 0.7593749761581421,
"rewards/chosen": -1.0004963874816895,
"rewards/margins": 0.8680494427680969,
"rewards/rejected": -1.8685458898544312,
"step": 1300
},
{
"epoch": 1.888118187550671,
"grad_norm": 36.914894104003906,
"learning_rate": 1.8101444535559656e-07,
"logits/chosen": -1.7614473104476929,
"logits/rejected": -1.7427914142608643,
"logps/chosen": -55.6743049621582,
"logps/rejected": -66.42680358886719,
"loss": 0.4846,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.0334885120391846,
"rewards/margins": 0.9464027285575867,
"rewards/rejected": -1.9798911809921265,
"step": 1310
},
{
"epoch": 1.9025313034861724,
"grad_norm": 32.4942512512207,
"learning_rate": 1.769895948289989e-07,
"logits/chosen": -1.8014084100723267,
"logits/rejected": -1.7888679504394531,
"logps/chosen": -62.43854904174805,
"logps/rejected": -71.75282287597656,
"loss": 0.4944,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -1.093829870223999,
"rewards/margins": 0.8591756820678711,
"rewards/rejected": -1.9530055522918701,
"step": 1320
},
{
"epoch": 1.9169444194216738,
"grad_norm": 35.89755630493164,
"learning_rate": 1.729853281579414e-07,
"logits/chosen": -1.7554981708526611,
"logits/rejected": -1.7466237545013428,
"logps/chosen": -62.69568634033203,
"logps/rejected": -75.16825866699219,
"loss": 0.4606,
"rewards/accuracies": 0.778124988079071,
"rewards/chosen": -1.1235264539718628,
"rewards/margins": 0.9610961079597473,
"rewards/rejected": -2.084622383117676,
"step": 1330
},
{
"epoch": 1.931357535357175,
"grad_norm": 23.533727645874023,
"learning_rate": 1.6900277426708222e-07,
"logits/chosen": -1.7824815511703491,
"logits/rejected": -1.7752597332000732,
"logps/chosen": -59.05015182495117,
"logps/rejected": -70.8111343383789,
"loss": 0.5018,
"rewards/accuracies": 0.753125011920929,
"rewards/chosen": -1.1483525037765503,
"rewards/margins": 0.8362733125686646,
"rewards/rejected": -1.9846255779266357,
"step": 1340
},
{
"epoch": 1.9457706512926762,
"grad_norm": 43.141048431396484,
"learning_rate": 1.650430559595859e-07,
"logits/chosen": -1.7303861379623413,
"logits/rejected": -1.7137393951416016,
"logps/chosen": -62.492454528808594,
"logps/rejected": -71.88333129882812,
"loss": 0.4932,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.2485052347183228,
"rewards/margins": 0.8395478129386902,
"rewards/rejected": -2.088052749633789,
"step": 1350
},
{
"epoch": 1.9601837672281777,
"grad_norm": 34.501712799072266,
"learning_rate": 1.6110728960057106e-07,
"logits/chosen": -1.7456886768341064,
"logits/rejected": -1.7302719354629517,
"logps/chosen": -56.20106887817383,
"logps/rejected": -67.40886688232422,
"loss": 0.5116,
"rewards/accuracies": 0.7406250238418579,
"rewards/chosen": -1.1765556335449219,
"rewards/margins": 0.8575465083122253,
"rewards/rejected": -2.034101963043213,
"step": 1360
},
{
"epoch": 1.974596883163679,
"grad_norm": 44.137115478515625,
"learning_rate": 1.5719658480237269e-07,
"logits/chosen": -1.7316787242889404,
"logits/rejected": -1.7204573154449463,
"logps/chosen": -57.36134719848633,
"logps/rejected": -68.31610870361328,
"loss": 0.5027,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.3111565113067627,
"rewards/margins": 0.8917368054389954,
"rewards/rejected": -2.2028934955596924,
"step": 1370
},
{
"epoch": 1.9890099990991803,
"grad_norm": 32.499935150146484,
"learning_rate": 1.5331204411170932e-07,
"logits/chosen": -1.7521297931671143,
"logits/rejected": -1.7524783611297607,
"logps/chosen": -61.551841735839844,
"logps/rejected": -73.5947265625,
"loss": 0.4825,
"rewards/accuracies": 0.7593749761581421,
"rewards/chosen": -1.2590062618255615,
"rewards/margins": 0.8878059387207031,
"rewards/rejected": -2.1468119621276855,
"step": 1380
},
{
"epoch": 2.0034231150346815,
"grad_norm": 35.52438735961914,
"learning_rate": 1.494547626988408e-07,
"logits/chosen": -1.7367178201675415,
"logits/rejected": -1.7304236888885498,
"logps/chosen": -59.0101203918457,
"logps/rejected": -69.7065658569336,
"loss": 0.4775,
"rewards/accuracies": 0.7718750238418579,
"rewards/chosen": -1.156428575515747,
"rewards/margins": 0.8491467237472534,
"rewards/rejected": -2.005575656890869,
"step": 1390
},
{
"epoch": 2.017836230970183,
"grad_norm": 28.009803771972656,
"learning_rate": 1.456258280488073e-07,
"logits/chosen": -1.691402792930603,
"logits/rejected": -1.6789098978042603,
"logps/chosen": -55.73322677612305,
"logps/rejected": -70.45665740966797,
"loss": 0.4085,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.081807255744934,
"rewards/margins": 1.1556535959243774,
"rewards/rejected": -2.2374606132507324,
"step": 1400
},
{
"epoch": 2.0322493469056844,
"grad_norm": 27.49490737915039,
"learning_rate": 1.4182631965483305e-07,
"logits/chosen": -1.7311683893203735,
"logits/rejected": -1.726595163345337,
"logps/chosen": -58.620765686035156,
"logps/rejected": -72.42903137207031,
"loss": 0.4229,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.149526834487915,
"rewards/margins": 1.0777475833892822,
"rewards/rejected": -2.2272744178771973,
"step": 1410
},
{
"epoch": 2.0466624628411854,
"grad_norm": 30.135372161865234,
"learning_rate": 1.3805730871398584e-07,
"logits/chosen": -1.6934964656829834,
"logits/rejected": -1.677056908607483,
"logps/chosen": -53.71419143676758,
"logps/rejected": -67.37553405761719,
"loss": 0.3983,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.1637048721313477,
"rewards/margins": 1.193371057510376,
"rewards/rejected": -2.3570759296417236,
"step": 1420
},
{
"epoch": 2.061075578776687,
"grad_norm": 26.996715545654297,
"learning_rate": 1.3431985782517226e-07,
"logits/chosen": -1.6753069162368774,
"logits/rejected": -1.65777587890625,
"logps/chosen": -57.29792404174805,
"logps/rejected": -70.36094665527344,
"loss": 0.4289,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.2193354368209839,
"rewards/margins": 1.0907495021820068,
"rewards/rejected": -2.310084819793701,
"step": 1430
},
{
"epoch": 2.075488694712188,
"grad_norm": 30.4396915435791,
"learning_rate": 1.3061502068956042e-07,
"logits/chosen": -1.6357700824737549,
"logits/rejected": -1.6307754516601562,
"logps/chosen": -56.32429885864258,
"logps/rejected": -70.00544738769531,
"loss": 0.3999,
"rewards/accuracies": 0.8218749761581421,
"rewards/chosen": -1.1628742218017578,
"rewards/margins": 1.1366623640060425,
"rewards/rejected": -2.2995364665985107,
"step": 1440
},
{
"epoch": 2.089901810647689,
"grad_norm": 33.75346374511719,
"learning_rate": 1.2694384181350792e-07,
"logits/chosen": -1.6029478311538696,
"logits/rejected": -1.5960829257965088,
"logps/chosen": -62.79957962036133,
"logps/rejected": -76.08135223388672,
"loss": 0.4139,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.2355473041534424,
"rewards/margins": 1.1187636852264404,
"rewards/rejected": -2.3543107509613037,
"step": 1450
},
{
"epoch": 2.1043149265831906,
"grad_norm": 23.287256240844727,
"learning_rate": 1.2330735621408514e-07,
"logits/chosen": -1.6741046905517578,
"logits/rejected": -1.6551471948623657,
"logps/chosen": -58.5982780456543,
"logps/rejected": -71.42253112792969,
"loss": 0.3971,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2658870220184326,
"rewards/margins": 1.1862525939941406,
"rewards/rejected": -2.4521396160125732,
"step": 1460
},
{
"epoch": 2.118728042518692,
"grad_norm": 24.4993896484375,
"learning_rate": 1.1970658912727138e-07,
"logits/chosen": -1.612346887588501,
"logits/rejected": -1.5920403003692627,
"logps/chosen": -59.92454147338867,
"logps/rejected": -73.3915786743164,
"loss": 0.4134,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.242612600326538,
"rewards/margins": 1.216597557067871,
"rewards/rejected": -2.459210157394409,
"step": 1470
},
{
"epoch": 2.1331411584541935,
"grad_norm": 41.4117546081543,
"learning_rate": 1.1614255571891016e-07,
"logits/chosen": -1.5743157863616943,
"logits/rejected": -1.5713180303573608,
"logps/chosen": -61.38301467895508,
"logps/rejected": -75.31806182861328,
"loss": 0.4241,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4350389242172241,
"rewards/margins": 1.2023990154266357,
"rewards/rejected": -2.6374378204345703,
"step": 1480
},
{
"epoch": 2.1475542743896945,
"grad_norm": 25.956771850585938,
"learning_rate": 1.1261626079850295e-07,
"logits/chosen": -1.5780308246612549,
"logits/rejected": -1.5647315979003906,
"logps/chosen": -59.95500564575195,
"logps/rejected": -76.34349060058594,
"loss": 0.3777,
"rewards/accuracies": 0.846875011920929,
"rewards/chosen": -1.2126551866531372,
"rewards/margins": 1.368710994720459,
"rewards/rejected": -2.5813660621643066,
"step": 1490
},
{
"epoch": 2.161967390325196,
"grad_norm": 31.3956298828125,
"learning_rate": 1.0912869853592247e-07,
"logits/chosen": -1.6019783020019531,
"logits/rejected": -1.5898057222366333,
"logps/chosen": -61.808135986328125,
"logps/rejected": -77.27191925048828,
"loss": 0.3935,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3138659000396729,
"rewards/margins": 1.354296088218689,
"rewards/rejected": -2.6681618690490723,
"step": 1500
},
{
"epoch": 2.1763805062606973,
"grad_norm": 35.722694396972656,
"learning_rate": 1.056808521811268e-07,
"logits/chosen": -1.561440348625183,
"logits/rejected": -1.546866774559021,
"logps/chosen": -57.341331481933594,
"logps/rejected": -72.38762664794922,
"loss": 0.4121,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2446476221084595,
"rewards/margins": 1.2853491306304932,
"rewards/rejected": -2.529996633529663,
"step": 1510
},
{
"epoch": 2.1907936221961988,
"grad_norm": 38.04680252075195,
"learning_rate": 1.0227369378695006e-07,
"logits/chosen": -1.60148024559021,
"logits/rejected": -1.5893213748931885,
"logps/chosen": -63.2020378112793,
"logps/rejected": -78.1455307006836,
"loss": 0.3897,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.4317493438720703,
"rewards/margins": 1.2837940454483032,
"rewards/rejected": -2.715543270111084,
"step": 1520
},
{
"epoch": 2.2052067381316998,
"grad_norm": 29.301164627075195,
"learning_rate": 9.890818393505251e-08,
"logits/chosen": -1.586211085319519,
"logits/rejected": -1.5707120895385742,
"logps/chosen": -65.30186462402344,
"logps/rejected": -80.65480041503906,
"loss": 0.3727,
"rewards/accuracies": 0.8656250238418579,
"rewards/chosen": -1.4216526746749878,
"rewards/margins": 1.3828752040863037,
"rewards/rejected": -2.804527997970581,
"step": 1530
},
{
"epoch": 2.219619854067201,
"grad_norm": 29.37394905090332,
"learning_rate": 9.55852714651019e-08,
"logits/chosen": -1.5325640439987183,
"logits/rejected": -1.5217316150665283,
"logps/chosen": -61.10272979736328,
"logps/rejected": -73.1271743774414,
"loss": 0.416,
"rewards/accuracies": 0.7906249761581421,
"rewards/chosen": -1.4346240758895874,
"rewards/margins": 1.1002185344696045,
"rewards/rejected": -2.5348427295684814,
"step": 1540
},
{
"epoch": 2.2340329700027026,
"grad_norm": 26.91488265991211,
"learning_rate": 9.230589320726809e-08,
"logits/chosen": -1.540071725845337,
"logits/rejected": -1.5339853763580322,
"logps/chosen": -57.176513671875,
"logps/rejected": -70.31219482421875,
"loss": 0.4096,
"rewards/accuracies": 0.840624988079071,
"rewards/chosen": -1.4375228881835938,
"rewards/margins": 1.2083842754364014,
"rewards/rejected": -2.645906925201416,
"step": 1550
},
{
"epoch": 2.2484460859382036,
"grad_norm": 31.109094619750977,
"learning_rate": 8.907097371810085e-08,
"logits/chosen": -1.5355430841445923,
"logits/rejected": -1.530775785446167,
"logps/chosen": -59.644500732421875,
"logps/rejected": -74.54759216308594,
"loss": 0.411,
"rewards/accuracies": 0.784375011920929,
"rewards/chosen": -1.4771003723144531,
"rewards/margins": 1.1839910745620728,
"rewards/rejected": -2.6610913276672363,
"step": 1560
},
{
"epoch": 2.262859201873705,
"grad_norm": 25.765758514404297,
"learning_rate": 8.588142501987017e-08,
"logits/chosen": -1.5403801202774048,
"logits/rejected": -1.522972822189331,
"logps/chosen": -60.12090301513672,
"logps/rejected": -74.10574340820312,
"loss": 0.4106,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4168094396591187,
"rewards/margins": 1.23050856590271,
"rewards/rejected": -2.647318124771118,
"step": 1570
},
{
"epoch": 2.2772723178092065,
"grad_norm": 35.56646728515625,
"learning_rate": 8.273814634343893e-08,
"logits/chosen": -1.5780024528503418,
"logits/rejected": -1.5696873664855957,
"logps/chosen": -56.7363166809082,
"logps/rejected": -71.40894317626953,
"loss": 0.4262,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.4435478448867798,
"rewards/margins": 1.132821798324585,
"rewards/rejected": -2.5763697624206543,
"step": 1580
},
{
"epoch": 2.291685433744708,
"grad_norm": 29.11915397644043,
"learning_rate": 7.96420238747425e-08,
"logits/chosen": -1.547071933746338,
"logits/rejected": -1.5279027223587036,
"logps/chosen": -59.5390510559082,
"logps/rejected": -74.87039184570312,
"loss": 0.4048,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -1.40023934841156,
"rewards/margins": 1.345477819442749,
"rewards/rejected": -2.7457172870635986,
"step": 1590
},
{
"epoch": 2.306098549680209,
"grad_norm": 32.61679458618164,
"learning_rate": 7.659393050494595e-08,
"logits/chosen": -1.5456218719482422,
"logits/rejected": -1.5411933660507202,
"logps/chosen": -65.36607360839844,
"logps/rejected": -77.77400970458984,
"loss": 0.4655,
"rewards/accuracies": 0.784375011920929,
"rewards/chosen": -1.5160075426101685,
"rewards/margins": 1.0541882514953613,
"rewards/rejected": -2.5701956748962402,
"step": 1600
},
{
"epoch": 2.3205116656157103,
"grad_norm": 40.657920837402344,
"learning_rate": 7.35947255843494e-08,
"logits/chosen": -1.5437052249908447,
"logits/rejected": -1.532766580581665,
"logps/chosen": -62.3788948059082,
"logps/rejected": -73.6574478149414,
"loss": 0.4469,
"rewards/accuracies": 0.809374988079071,
"rewards/chosen": -1.4102076292037964,
"rewards/margins": 1.109969139099121,
"rewards/rejected": -2.520177125930786,
"step": 1610
},
{
"epoch": 2.3349247815512117,
"grad_norm": 26.726566314697266,
"learning_rate": 7.064525468011107e-08,
"logits/chosen": -1.5550864934921265,
"logits/rejected": -1.5475775003433228,
"logps/chosen": -57.881187438964844,
"logps/rejected": -72.7550048828125,
"loss": 0.3932,
"rewards/accuracies": 0.809374988079071,
"rewards/chosen": -1.3459597826004028,
"rewards/margins": 1.2489373683929443,
"rewards/rejected": -2.5948970317840576,
"step": 1620
},
{
"epoch": 2.3493378974867127,
"grad_norm": 26.51803970336914,
"learning_rate": 6.774634933785611e-08,
"logits/chosen": -1.5511729717254639,
"logits/rejected": -1.5490328073501587,
"logps/chosen": -63.644187927246094,
"logps/rejected": -77.2373046875,
"loss": 0.4253,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.3264816999435425,
"rewards/margins": 1.1635551452636719,
"rewards/rejected": -2.490036725997925,
"step": 1630
},
{
"epoch": 2.363751013422214,
"grad_norm": 37.48248291015625,
"learning_rate": 6.489882684723872e-08,
"logits/chosen": -1.6325349807739258,
"logits/rejected": -1.611288070678711,
"logps/chosen": -59.67763137817383,
"logps/rejected": -74.39009857177734,
"loss": 0.407,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.3188283443450928,
"rewards/margins": 1.2371572256088257,
"rewards/rejected": -2.555985927581787,
"step": 1640
},
{
"epoch": 2.3781641293577156,
"grad_norm": 33.88801956176758,
"learning_rate": 6.210349001152304e-08,
"logits/chosen": -1.56001877784729,
"logits/rejected": -1.545484185218811,
"logps/chosen": -54.10548782348633,
"logps/rejected": -68.94291687011719,
"loss": 0.4144,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2859638929367065,
"rewards/margins": 1.1985052824020386,
"rewards/rejected": -2.484469175338745,
"step": 1650
},
{
"epoch": 2.392577245293217,
"grad_norm": 27.400054931640625,
"learning_rate": 5.936112692124884e-08,
"logits/chosen": -1.5780709981918335,
"logits/rejected": -1.5682373046875,
"logps/chosen": -61.6357421875,
"logps/rejected": -77.30577850341797,
"loss": 0.407,
"rewards/accuracies": 0.809374988079071,
"rewards/chosen": -1.3840547800064087,
"rewards/margins": 1.2264368534088135,
"rewards/rejected": -2.6104912757873535,
"step": 1660
},
{
"epoch": 2.406990361228718,
"grad_norm": 38.04377746582031,
"learning_rate": 5.66725107320444e-08,
"logits/chosen": -1.546272873878479,
"logits/rejected": -1.5431301593780518,
"logps/chosen": -62.60614776611328,
"logps/rejected": -76.22834014892578,
"loss": 0.4048,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -1.454508900642395,
"rewards/margins": 1.2125307321548462,
"rewards/rejected": -2.667039394378662,
"step": 1670
},
{
"epoch": 2.4214034771642194,
"grad_norm": 34.18000411987305,
"learning_rate": 5.403839944665081e-08,
"logits/chosen": -1.617413878440857,
"logits/rejected": -1.618520736694336,
"logps/chosen": -62.6517333984375,
"logps/rejected": -76.1615982055664,
"loss": 0.4568,
"rewards/accuracies": 0.746874988079071,
"rewards/chosen": -1.4979777336120605,
"rewards/margins": 1.052832007408142,
"rewards/rejected": -2.550809860229492,
"step": 1680
},
{
"epoch": 2.435816593099721,
"grad_norm": 25.391855239868164,
"learning_rate": 5.1459535701217694e-08,
"logits/chosen": -1.5523961782455444,
"logits/rejected": -1.539656400680542,
"logps/chosen": -59.02225875854492,
"logps/rejected": -77.31755065917969,
"loss": 0.3576,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.3841768503189087,
"rewards/margins": 1.4941140413284302,
"rewards/rejected": -2.8782906532287598,
"step": 1690
},
{
"epoch": 2.450229709035222,
"grad_norm": 27.22433090209961,
"learning_rate": 4.8936646555931245e-08,
"logits/chosen": -1.5299670696258545,
"logits/rejected": -1.5135704278945923,
"logps/chosen": -59.751251220703125,
"logps/rejected": -74.75291442871094,
"loss": 0.3773,
"rewards/accuracies": 0.815625011920929,
"rewards/chosen": -1.446244239807129,
"rewards/margins": 1.4083815813064575,
"rewards/rejected": -2.854625701904297,
"step": 1700
},
{
"epoch": 2.4646428249707233,
"grad_norm": 27.11460304260254,
"learning_rate": 4.647044329003458e-08,
"logits/chosen": -1.5377050638198853,
"logits/rejected": -1.5185314416885376,
"logps/chosen": -62.594635009765625,
"logps/rejected": -76.7301254272461,
"loss": 0.4388,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.5232594013214111,
"rewards/margins": 1.201012372970581,
"rewards/rejected": -2.7242720127105713,
"step": 1710
},
{
"epoch": 2.4790559409062247,
"grad_norm": 26.611129760742188,
"learning_rate": 4.406162120129548e-08,
"logits/chosen": -1.5484386682510376,
"logits/rejected": -1.5278335809707642,
"logps/chosen": -63.33378982543945,
"logps/rejected": -78.29924774169922,
"loss": 0.4024,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.523051142692566,
"rewards/margins": 1.292851209640503,
"rewards/rejected": -2.8159022331237793,
"step": 1720
},
{
"epoch": 2.493469056841726,
"grad_norm": 27.76645851135254,
"learning_rate": 4.171085940998176e-08,
"logits/chosen": -1.5417293310165405,
"logits/rejected": -1.531988501548767,
"logps/chosen": -62.10388946533203,
"logps/rejected": -76.06646728515625,
"loss": 0.388,
"rewards/accuracies": 0.8218749761581421,
"rewards/chosen": -1.474600076675415,
"rewards/margins": 1.2824805974960327,
"rewards/rejected": -2.757080554962158,
"step": 1730
},
{
"epoch": 2.507882172777227,
"grad_norm": 34.98466110229492,
"learning_rate": 3.941882066739569e-08,
"logits/chosen": -1.5331491231918335,
"logits/rejected": -1.5120114088058472,
"logps/chosen": -59.76226806640625,
"logps/rejected": -75.82611083984375,
"loss": 0.3892,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.515598177909851,
"rewards/margins": 1.3365103006362915,
"rewards/rejected": -2.8521084785461426,
"step": 1740
},
{
"epoch": 2.5222952887127286,
"grad_norm": 38.75511169433594,
"learning_rate": 3.71861511690251e-08,
"logits/chosen": -1.4918270111083984,
"logits/rejected": -1.4848248958587646,
"logps/chosen": -62.17346954345703,
"logps/rejected": -76.3984375,
"loss": 0.3918,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.542625069618225,
"rewards/margins": 1.2679589986801147,
"rewards/rejected": -2.81058406829834,
"step": 1750
},
{
"epoch": 2.53670840464823,
"grad_norm": 39.63160705566406,
"learning_rate": 3.5013480372360373e-08,
"logits/chosen": -1.5405272245407104,
"logits/rejected": -1.5281922817230225,
"logps/chosen": -65.55313873291016,
"logps/rejected": -79.51661682128906,
"loss": 0.3812,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.5600948333740234,
"rewards/margins": 1.3620662689208984,
"rewards/rejected": -2.922161102294922,
"step": 1760
},
{
"epoch": 2.551121520583731,
"grad_norm": 35.51730728149414,
"learning_rate": 3.290142081943184e-08,
"logits/chosen": -1.5395221710205078,
"logits/rejected": -1.5303739309310913,
"logps/chosen": -66.13626098632812,
"logps/rejected": -80.83757019042969,
"loss": 0.3976,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.5481364727020264,
"rewards/margins": 1.3121622800827026,
"rewards/rejected": -2.8602986335754395,
"step": 1770
},
{
"epoch": 2.5655346365192324,
"grad_norm": 32.397037506103516,
"learning_rate": 3.085056796411528e-08,
"logits/chosen": -1.4974429607391357,
"logits/rejected": -1.4794594049453735,
"logps/chosen": -67.4955825805664,
"logps/rejected": -81.5915298461914,
"loss": 0.4235,
"rewards/accuracies": 0.7906249761581421,
"rewards/chosen": -1.6371116638183594,
"rewards/margins": 1.2419166564941406,
"rewards/rejected": -2.879027843475342,
"step": 1780
},
{
"epoch": 2.579947752454734,
"grad_norm": 30.48466682434082,
"learning_rate": 2.8861500004255328e-08,
"logits/chosen": -1.5640184879302979,
"logits/rejected": -1.5465617179870605,
"logps/chosen": -62.553428649902344,
"logps/rejected": -75.51476287841797,
"loss": 0.4137,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.5564630031585693,
"rewards/margins": 1.250534176826477,
"rewards/rejected": -2.806997299194336,
"step": 1790
},
{
"epoch": 2.5943608683902353,
"grad_norm": 33.36497497558594,
"learning_rate": 2.6934777718653988e-08,
"logits/chosen": -1.5691068172454834,
"logits/rejected": -1.550433874130249,
"logps/chosen": -64.53910827636719,
"logps/rejected": -80.43321228027344,
"loss": 0.3994,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.5547971725463867,
"rewards/margins": 1.2979726791381836,
"rewards/rejected": -2.8527698516845703,
"step": 1800
},
{
"epoch": 2.6087739843257363,
"grad_norm": 22.623056411743164,
"learning_rate": 2.507094430897e-08,
"logits/chosen": -1.5123775005340576,
"logits/rejected": -1.4988584518432617,
"logps/chosen": -62.6684684753418,
"logps/rejected": -78.53128051757812,
"loss": 0.3838,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5202034711837769,
"rewards/margins": 1.3458402156829834,
"rewards/rejected": -2.8660435676574707,
"step": 1810
},
{
"epoch": 2.6231871002612377,
"grad_norm": 27.248733520507812,
"learning_rate": 2.3270525246573717e-08,
"logits/chosen": -1.5637412071228027,
"logits/rejected": -1.5629525184631348,
"logps/chosen": -63.67070770263672,
"logps/rejected": -78.90650939941406,
"loss": 0.3855,
"rewards/accuracies": 0.815625011920929,
"rewards/chosen": -1.4608145952224731,
"rewards/margins": 1.333916187286377,
"rewards/rejected": -2.7947306632995605,
"step": 1820
},
{
"epoch": 2.637600216196739,
"grad_norm": 44.09688186645508,
"learning_rate": 2.153402812440075e-08,
"logits/chosen": -1.5495104789733887,
"logits/rejected": -1.5366135835647583,
"logps/chosen": -62.17615509033203,
"logps/rejected": -77.5916748046875,
"loss": 0.4239,
"rewards/accuracies": 0.809374988079071,
"rewards/chosen": -1.4907768964767456,
"rewards/margins": 1.2403948307037354,
"rewards/rejected": -2.7311718463897705,
"step": 1830
},
{
"epoch": 2.65201333213224,
"grad_norm": 39.35377883911133,
"learning_rate": 1.9861942513846126e-08,
"logits/chosen": -1.5815684795379639,
"logits/rejected": -1.570434808731079,
"logps/chosen": -68.02040100097656,
"logps/rejected": -82.66510009765625,
"loss": 0.4036,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.595801830291748,
"rewards/margins": 1.288498878479004,
"rewards/rejected": -2.884300708770752,
"step": 1840
},
{
"epoch": 2.6664264480677415,
"grad_norm": 35.275421142578125,
"learning_rate": 1.8254739826739087e-08,
"logits/chosen": -1.5678503513336182,
"logits/rejected": -1.5563210248947144,
"logps/chosen": -64.60910034179688,
"logps/rejected": -81.83099365234375,
"loss": 0.3813,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -1.5075803995132446,
"rewards/margins": 1.4048585891723633,
"rewards/rejected": -2.9124391078948975,
"step": 1850
},
{
"epoch": 2.680839564003243,
"grad_norm": 36.913028717041016,
"learning_rate": 1.6712873182437915e-08,
"logits/chosen": -1.6048858165740967,
"logits/rejected": -1.5977307558059692,
"logps/chosen": -62.97712326049805,
"logps/rejected": -77.24687194824219,
"loss": 0.406,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5249515771865845,
"rewards/margins": 1.1899343729019165,
"rewards/rejected": -2.714885711669922,
"step": 1860
},
{
"epoch": 2.6952526799387444,
"grad_norm": 30.57379150390625,
"learning_rate": 1.5236777280081603e-08,
"logits/chosen": -1.545700192451477,
"logits/rejected": -1.527111291885376,
"logps/chosen": -60.37244415283203,
"logps/rejected": -75.10139465332031,
"loss": 0.4116,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4861081838607788,
"rewards/margins": 1.261631965637207,
"rewards/rejected": -2.7477405071258545,
"step": 1870
},
{
"epoch": 2.709665795874246,
"grad_norm": 34.18118667602539,
"learning_rate": 1.3826868276035103e-08,
"logits/chosen": -1.5901832580566406,
"logits/rejected": -1.5766841173171997,
"logps/chosen": -66.63416290283203,
"logps/rejected": -81.32105255126953,
"loss": 0.3765,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.4631272554397583,
"rewards/margins": 1.3422093391418457,
"rewards/rejected": -2.8053364753723145,
"step": 1880
},
{
"epoch": 2.724078911809747,
"grad_norm": 27.860313415527344,
"learning_rate": 1.2483543666562097e-08,
"logits/chosen": -1.5422031879425049,
"logits/rejected": -1.5396907329559326,
"logps/chosen": -59.41462326049805,
"logps/rejected": -74.75579071044922,
"loss": 0.4246,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.4775583744049072,
"rewards/margins": 1.207562804222107,
"rewards/rejected": -2.6851210594177246,
"step": 1890
},
{
"epoch": 2.7384920277452482,
"grad_norm": 28.36042022705078,
"learning_rate": 1.1207182175758585e-08,
"logits/chosen": -1.548557996749878,
"logits/rejected": -1.534714937210083,
"logps/chosen": -59.4514045715332,
"logps/rejected": -74.65626525878906,
"loss": 0.4059,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3794094324111938,
"rewards/margins": 1.263185739517212,
"rewards/rejected": -2.642595052719116,
"step": 1900
},
{
"epoch": 2.7529051436807492,
"grad_norm": 27.59717559814453,
"learning_rate": 9.998143648779434e-09,
"logits/chosen": -1.5367798805236816,
"logits/rejected": -1.5305159091949463,
"logps/chosen": -61.03411102294922,
"logps/rejected": -76.11595153808594,
"loss": 0.4241,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4926228523254395,
"rewards/margins": 1.2798919677734375,
"rewards/rejected": -2.772514820098877,
"step": 1910
},
{
"epoch": 2.7673182596162507,
"grad_norm": 32.14241027832031,
"learning_rate": 8.856768950386478e-09,
"logits/chosen": -1.577178716659546,
"logits/rejected": -1.5694324970245361,
"logps/chosen": -57.841270446777344,
"logps/rejected": -71.33273315429688,
"loss": 0.4478,
"rewards/accuracies": 0.784375011920929,
"rewards/chosen": -1.4872713088989258,
"rewards/margins": 1.0947293043136597,
"rewards/rejected": -2.582000255584717,
"step": 1920
},
{
"epoch": 2.781731375551752,
"grad_norm": 29.319686889648438,
"learning_rate": 7.783379868849e-09,
"logits/chosen": -1.5487927198410034,
"logits/rejected": -1.5374157428741455,
"logps/chosen": -64.15359497070312,
"logps/rejected": -78.96765899658203,
"loss": 0.3825,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.4542877674102783,
"rewards/margins": 1.3190295696258545,
"rewards/rejected": -2.773317337036133,
"step": 1930
},
{
"epoch": 2.7961444914872535,
"grad_norm": 34.06153869628906,
"learning_rate": 6.778279025221212e-09,
"logits/chosen": -1.5599983930587769,
"logits/rejected": -1.5473382472991943,
"logps/chosen": -62.492393493652344,
"logps/rejected": -79.02606964111328,
"loss": 0.3907,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.478379726409912,
"rewards/margins": 1.287481665611267,
"rewards/rejected": -2.7658615112304688,
"step": 1940
},
{
"epoch": 2.810557607422755,
"grad_norm": 31.845796585083008,
"learning_rate": 5.841749788024791e-09,
"logits/chosen": -1.5394327640533447,
"logits/rejected": -1.5315256118774414,
"logps/chosen": -60.67429733276367,
"logps/rejected": -74.8219223022461,
"loss": 0.4335,
"rewards/accuracies": 0.809374988079071,
"rewards/chosen": -1.486366629600525,
"rewards/margins": 1.1716400384902954,
"rewards/rejected": -2.6580066680908203,
"step": 1950
},
{
"epoch": 2.824970723358256,
"grad_norm": 29.72622299194336,
"learning_rate": 4.974056193358084e-09,
"logits/chosen": -1.5817995071411133,
"logits/rejected": -1.561623454093933,
"logps/chosen": -66.63758850097656,
"logps/rejected": -82.53173065185547,
"loss": 0.3726,
"rewards/accuracies": 0.846875011920929,
"rewards/chosen": -1.407747745513916,
"rewards/margins": 1.4503003358840942,
"rewards/rejected": -2.8580479621887207,
"step": 1960
},
{
"epoch": 2.8393838392937574,
"grad_norm": 33.73102951049805,
"learning_rate": 4.175442870456708e-09,
"logits/chosen": -1.510568618774414,
"logits/rejected": -1.4960753917694092,
"logps/chosen": -62.57592010498047,
"logps/rejected": -76.44905090332031,
"loss": 0.4293,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.516035795211792,
"rewards/margins": 1.1771646738052368,
"rewards/rejected": -2.6932003498077393,
"step": 1970
},
{
"epoch": 2.8537969552292584,
"grad_norm": 37.108036041259766,
"learning_rate": 3.44613497272489e-09,
"logits/chosen": -1.5102002620697021,
"logits/rejected": -1.4990047216415405,
"logps/chosen": -64.62440490722656,
"logps/rejected": -77.6097412109375,
"loss": 0.4306,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.4141771793365479,
"rewards/margins": 1.1328486204147339,
"rewards/rejected": -2.547025680541992,
"step": 1980
},
{
"epoch": 2.86821007116476,
"grad_norm": 28.47416114807129,
"learning_rate": 2.786338114258019e-09,
"logits/chosen": -1.5360634326934814,
"logits/rejected": -1.5190080404281616,
"logps/chosen": -62.32354736328125,
"logps/rejected": -79.61341094970703,
"loss": 0.3866,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.382712721824646,
"rewards/margins": 1.3544337749481201,
"rewards/rejected": -2.7371468544006348,
"step": 1990
},
{
"epoch": 2.882623187100261,
"grad_norm": 33.172950744628906,
"learning_rate": 2.1962383118736828e-09,
"logits/chosen": -1.5397237539291382,
"logits/rejected": -1.5202221870422363,
"logps/chosen": -61.6288948059082,
"logps/rejected": -77.49903869628906,
"loss": 0.3673,
"rewards/accuracies": 0.8531249761581421,
"rewards/chosen": -1.417991280555725,
"rewards/margins": 1.4083993434906006,
"rewards/rejected": -2.826390504837036,
"step": 2000
},
{
"epoch": 2.8970363030357626,
"grad_norm": 41.858070373535156,
"learning_rate": 1.6760019326678698e-09,
"logits/chosen": -1.5235203504562378,
"logits/rejected": -1.5131398439407349,
"logps/chosen": -62.8912353515625,
"logps/rejected": -75.68180847167969,
"loss": 0.44,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.5128400325775146,
"rewards/margins": 1.1097246408462524,
"rewards/rejected": -2.6225647926330566,
"step": 2010
},
{
"epoch": 2.911449418971264,
"grad_norm": 46.39678955078125,
"learning_rate": 1.2257756471110437e-09,
"logits/chosen": -1.5751731395721436,
"logits/rejected": -1.5582482814788818,
"logps/chosen": -64.07471466064453,
"logps/rejected": -77.11708068847656,
"loss": 0.4344,
"rewards/accuracies": 0.8218749761581421,
"rewards/chosen": -1.452874779701233,
"rewards/margins": 1.1758126020431519,
"rewards/rejected": -2.6286873817443848,
"step": 2020
},
{
"epoch": 2.925862534906765,
"grad_norm": 44.330745697021484,
"learning_rate": 8.456863876973586e-10,
"logits/chosen": -1.5380871295928955,
"logits/rejected": -1.531022310256958,
"logps/chosen": -61.59229278564453,
"logps/rejected": -74.27851867675781,
"loss": 0.4564,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.5201137065887451,
"rewards/margins": 1.1149994134902954,
"rewards/rejected": -2.635113000869751,
"step": 2030
},
{
"epoch": 2.9402756508422665,
"grad_norm": 33.99905014038086,
"learning_rate": 5.358413131582861e-10,
"logits/chosen": -1.5820564031600952,
"logits/rejected": -1.5623215436935425,
"logps/chosen": -62.37836837768555,
"logps/rejected": -79.43901062011719,
"loss": 0.3971,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.5787489414215088,
"rewards/margins": 1.3647538423538208,
"rewards/rejected": -2.943502902984619,
"step": 2040
},
{
"epoch": 2.954688766777768,
"grad_norm": 32.752960205078125,
"learning_rate": 2.963277782515872e-10,
"logits/chosen": -1.581578016281128,
"logits/rejected": -1.5734798908233643,
"logps/chosen": -63.77600860595703,
"logps/rejected": -78.80097198486328,
"loss": 0.4043,
"rewards/accuracies": 0.8218749761581421,
"rewards/chosen": -1.5265841484069824,
"rewards/margins": 1.2426446676254272,
"rewards/rejected": -2.769228935241699,
"step": 2050
},
{
"epoch": 2.969101882713269,
"grad_norm": 28.23070526123047,
"learning_rate": 1.272133091331229e-10,
"logits/chosen": -1.5508421659469604,
"logits/rejected": -1.5296354293823242,
"logps/chosen": -57.603782653808594,
"logps/rejected": -73.52263641357422,
"loss": 0.3884,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -1.363713026046753,
"rewards/margins": 1.3004621267318726,
"rewards/rejected": -2.664175510406494,
"step": 2060
},
{
"epoch": 2.9835149986487703,
"grad_norm": 37.76509475708008,
"learning_rate": 2.8545584319361605e-11,
"logits/chosen": -1.5593769550323486,
"logits/rejected": -1.5454738140106201,
"logps/chosen": -59.01500701904297,
"logps/rejected": -72.01415252685547,
"loss": 0.4342,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.4014034271240234,
"rewards/margins": 1.1465706825256348,
"rewards/rejected": -2.547974109649658,
"step": 2070
},
{
"epoch": 2.9964868029907215,
"step": 2079,
"total_flos": 0.0,
"train_loss": 0.5121967236028949,
"train_runtime": 18760.8926,
"train_samples_per_second": 3.55,
"train_steps_per_second": 0.111
}
],
"logging_steps": 10,
"max_steps": 2079,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}