zephyr-7b-dpo-full / trainer_state.json
RikkiXu's picture
Model save
aa55b9d verified
raw history blame
No virus
50.2 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9994242947610823,
"eval_steps": 100,
"global_step": 868,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 23.51828299790517,
"learning_rate": 5.747126436781609e-09,
"logits/chosen": -1.865264654159546,
"logits/rejected": -1.587956428527832,
"logps/chosen": -204.58331298828125,
"logps/rejected": -154.1517333984375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 24.149515970375678,
"learning_rate": 5.747126436781609e-08,
"logits/chosen": -1.90481698513031,
"logits/rejected": -1.8536584377288818,
"logps/chosen": -213.41416931152344,
"logps/rejected": -191.33694458007812,
"loss": 0.6932,
"rewards/accuracies": 0.4861111044883728,
"rewards/chosen": -1.9929786503780633e-05,
"rewards/margins": 0.00017105697770603,
"rewards/rejected": -0.00019098672783002257,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 23.563731768256098,
"learning_rate": 1.1494252873563217e-07,
"logits/chosen": -1.9680726528167725,
"logits/rejected": -1.798654317855835,
"logps/chosen": -255.55111694335938,
"logps/rejected": -189.6189727783203,
"loss": 0.6921,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.0016116431215777993,
"rewards/margins": 0.002336590550839901,
"rewards/rejected": -0.0007249473710544407,
"step": 20
},
{
"epoch": 0.03,
"grad_norm": 23.038450073297746,
"learning_rate": 1.7241379310344828e-07,
"logits/chosen": -1.8938862085342407,
"logits/rejected": -1.8228662014007568,
"logps/chosen": -212.65322875976562,
"logps/rejected": -194.4668426513672,
"loss": 0.6878,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.011502735316753387,
"rewards/margins": 0.014704583212733269,
"rewards/rejected": -0.003201847430318594,
"step": 30
},
{
"epoch": 0.05,
"grad_norm": 22.339093495440075,
"learning_rate": 2.2988505747126435e-07,
"logits/chosen": -1.8691730499267578,
"logits/rejected": -1.810280442237854,
"logps/chosen": -212.04031372070312,
"logps/rejected": -189.72427368164062,
"loss": 0.6773,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.027534600347280502,
"rewards/margins": 0.037894655019044876,
"rewards/rejected": -0.010360054671764374,
"step": 40
},
{
"epoch": 0.06,
"grad_norm": 21.83120331543706,
"learning_rate": 2.873563218390804e-07,
"logits/chosen": -1.9792773723602295,
"logits/rejected": -1.8856391906738281,
"logps/chosen": -199.00392150878906,
"logps/rejected": -184.42074584960938,
"loss": 0.6637,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.02775971218943596,
"rewards/margins": 0.08295276015996933,
"rewards/rejected": -0.05519305542111397,
"step": 50
},
{
"epoch": 0.07,
"grad_norm": 21.94313336281609,
"learning_rate": 3.4482758620689656e-07,
"logits/chosen": -1.978032112121582,
"logits/rejected": -1.8626216650009155,
"logps/chosen": -263.13702392578125,
"logps/rejected": -227.51931762695312,
"loss": 0.6365,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.024905700236558914,
"rewards/margins": 0.1394185870885849,
"rewards/rejected": -0.1643243134021759,
"step": 60
},
{
"epoch": 0.08,
"grad_norm": 21.93834951114425,
"learning_rate": 4.0229885057471266e-07,
"logits/chosen": -1.923208236694336,
"logits/rejected": -1.9092395305633545,
"logps/chosen": -211.4084930419922,
"logps/rejected": -216.09439086914062,
"loss": 0.6127,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.16204482316970825,
"rewards/margins": 0.21120235323905945,
"rewards/rejected": -0.3732471466064453,
"step": 70
},
{
"epoch": 0.09,
"grad_norm": 26.27963832031748,
"learning_rate": 4.597701149425287e-07,
"logits/chosen": -1.7020299434661865,
"logits/rejected": -1.635000467300415,
"logps/chosen": -229.10562133789062,
"logps/rejected": -228.198486328125,
"loss": 0.5888,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.3274237811565399,
"rewards/margins": 0.26525241136550903,
"rewards/rejected": -0.5926762819290161,
"step": 80
},
{
"epoch": 0.1,
"grad_norm": 35.47456739543052,
"learning_rate": 4.999817969178237e-07,
"logits/chosen": -1.768843412399292,
"logits/rejected": -1.73134446144104,
"logps/chosen": -271.71563720703125,
"logps/rejected": -283.0465393066406,
"loss": 0.5313,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.538571298122406,
"rewards/margins": 0.47389060258865356,
"rewards/rejected": -1.0124619007110596,
"step": 90
},
{
"epoch": 0.12,
"grad_norm": 38.67050237438448,
"learning_rate": 4.996582603056428e-07,
"logits/chosen": -1.7260372638702393,
"logits/rejected": -1.6588356494903564,
"logps/chosen": -285.2041320800781,
"logps/rejected": -323.65692138671875,
"loss": 0.5405,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6892239451408386,
"rewards/margins": 0.5662633180618286,
"rewards/rejected": -1.2554872035980225,
"step": 100
},
{
"epoch": 0.12,
"eval_logits/chosen": -1.746153473854065,
"eval_logits/rejected": -1.6546903848648071,
"eval_logps/chosen": -421.5047912597656,
"eval_logps/rejected": -451.7755432128906,
"eval_loss": 0.6086099743843079,
"eval_rewards/accuracies": 0.6953125,
"eval_rewards/chosen": -0.8599321246147156,
"eval_rewards/margins": 0.3267643451690674,
"eval_rewards/rejected": -1.1866965293884277,
"eval_runtime": 98.2501,
"eval_samples_per_second": 20.356,
"eval_steps_per_second": 0.326,
"step": 100
},
{
"epoch": 0.13,
"grad_norm": 56.77623681367674,
"learning_rate": 4.989308132738126e-07,
"logits/chosen": -1.8324391841888428,
"logits/rejected": -1.7346527576446533,
"logps/chosen": -289.9622802734375,
"logps/rejected": -307.9504699707031,
"loss": 0.5032,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7402961850166321,
"rewards/margins": 0.6292544007301331,
"rewards/rejected": -1.3695508241653442,
"step": 110
},
{
"epoch": 0.14,
"grad_norm": 54.65739090602792,
"learning_rate": 4.978006327248536e-07,
"logits/chosen": -1.91842520236969,
"logits/rejected": -1.849988579750061,
"logps/chosen": -323.345703125,
"logps/rejected": -366.32415771484375,
"loss": 0.4966,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.107177972793579,
"rewards/margins": 0.7300722599029541,
"rewards/rejected": -1.8372503519058228,
"step": 120
},
{
"epoch": 0.15,
"grad_norm": 40.66462467188264,
"learning_rate": 4.962695471250032e-07,
"logits/chosen": -1.7266982793807983,
"logits/rejected": -1.6543283462524414,
"logps/chosen": -320.31195068359375,
"logps/rejected": -359.983154296875,
"loss": 0.4886,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0283275842666626,
"rewards/margins": 0.7512324452400208,
"rewards/rejected": -1.7795600891113281,
"step": 130
},
{
"epoch": 0.16,
"grad_norm": 45.88018498600559,
"learning_rate": 4.94340033546025e-07,
"logits/chosen": -1.4110041856765747,
"logits/rejected": -1.3973127603530884,
"logps/chosen": -312.18145751953125,
"logps/rejected": -390.5517578125,
"loss": 0.4739,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.310011863708496,
"rewards/margins": 0.8049423098564148,
"rewards/rejected": -2.1149544715881348,
"step": 140
},
{
"epoch": 0.17,
"grad_norm": 79.78754356153908,
"learning_rate": 4.920152136576705e-07,
"logits/chosen": -1.2265546321868896,
"logits/rejected": -1.1716219186782837,
"logps/chosen": -357.737060546875,
"logps/rejected": -431.76806640625,
"loss": 0.4655,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4461175203323364,
"rewards/margins": 0.9848885536193848,
"rewards/rejected": -2.4310059547424316,
"step": 150
},
{
"epoch": 0.18,
"grad_norm": 40.08268655919122,
"learning_rate": 4.892988486772756e-07,
"logits/chosen": -1.2588635683059692,
"logits/rejected": -1.1425318717956543,
"logps/chosen": -354.57867431640625,
"logps/rejected": -432.987060546875,
"loss": 0.4787,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4544165134429932,
"rewards/margins": 0.9601584672927856,
"rewards/rejected": -2.4145748615264893,
"step": 160
},
{
"epoch": 0.2,
"grad_norm": 37.08844280081501,
"learning_rate": 4.861953332846629e-07,
"logits/chosen": -1.0948612689971924,
"logits/rejected": -0.9797511100769043,
"logps/chosen": -370.5609436035156,
"logps/rejected": -417.10418701171875,
"loss": 0.4741,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4464932680130005,
"rewards/margins": 0.8114526867866516,
"rewards/rejected": -2.257946014404297,
"step": 170
},
{
"epoch": 0.21,
"grad_norm": 51.03369267010431,
"learning_rate": 4.827096885121953e-07,
"logits/chosen": -0.9882611036300659,
"logits/rejected": -0.786241888999939,
"logps/chosen": -403.01361083984375,
"logps/rejected": -465.450439453125,
"loss": 0.4518,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.6873054504394531,
"rewards/margins": 0.8884965181350708,
"rewards/rejected": -2.5758020877838135,
"step": 180
},
{
"epoch": 0.22,
"grad_norm": 40.75117386512369,
"learning_rate": 4.788475536214821e-07,
"logits/chosen": -0.6994659900665283,
"logits/rejected": -0.57302325963974,
"logps/chosen": -345.23858642578125,
"logps/rejected": -434.90069580078125,
"loss": 0.4305,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.5364990234375,
"rewards/margins": 1.0722037553787231,
"rewards/rejected": -2.6087028980255127,
"step": 190
},
{
"epoch": 0.23,
"grad_norm": 50.385160508667006,
"learning_rate": 4.746151769798818e-07,
"logits/chosen": -0.46505388617515564,
"logits/rejected": -0.32105451822280884,
"logps/chosen": -395.0636901855469,
"logps/rejected": -491.369873046875,
"loss": 0.4371,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5998367071151733,
"rewards/margins": 1.3637341260910034,
"rewards/rejected": -2.9635708332061768,
"step": 200
},
{
"epoch": 0.23,
"eval_logits/chosen": -0.8866692185401917,
"eval_logits/rejected": -0.715141236782074,
"eval_logps/chosen": -537.5919799804688,
"eval_logps/rejected": -591.529052734375,
"eval_loss": 0.5454351305961609,
"eval_rewards/accuracies": 0.7421875,
"eval_rewards/chosen": -2.0208044052124023,
"eval_rewards/margins": 0.5634276270866394,
"eval_rewards/rejected": -2.5842318534851074,
"eval_runtime": 98.1521,
"eval_samples_per_second": 20.377,
"eval_steps_per_second": 0.326,
"step": 200
},
{
"epoch": 0.24,
"grad_norm": 44.17462139523744,
"learning_rate": 4.7001940595156055e-07,
"logits/chosen": -0.5879951119422913,
"logits/rejected": -0.31766843795776367,
"logps/chosen": -347.45184326171875,
"logps/rejected": -442.23291015625,
"loss": 0.466,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.621807336807251,
"rewards/margins": 1.1228187084197998,
"rewards/rejected": -2.7446258068084717,
"step": 210
},
{
"epoch": 0.25,
"grad_norm": 46.80720748583798,
"learning_rate": 4.650676758194623e-07,
"logits/chosen": -0.5494168996810913,
"logits/rejected": -0.3329974114894867,
"logps/chosen": -386.22528076171875,
"logps/rejected": -472.072998046875,
"loss": 0.419,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.6599994897842407,
"rewards/margins": 1.2505383491516113,
"rewards/rejected": -2.9105377197265625,
"step": 220
},
{
"epoch": 0.26,
"grad_norm": 43.28959440159286,
"learning_rate": 4.5976799775611215e-07,
"logits/chosen": -0.6910772919654846,
"logits/rejected": -0.4287993013858795,
"logps/chosen": -385.10784912109375,
"logps/rejected": -484.22314453125,
"loss": 0.43,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.7417128086090088,
"rewards/margins": 1.4360835552215576,
"rewards/rejected": -3.1777961254119873,
"step": 230
},
{
"epoch": 0.28,
"grad_norm": 48.21494711877692,
"learning_rate": 4.5412894586271543e-07,
"logits/chosen": -0.3966357111930847,
"logits/rejected": -0.13579869270324707,
"logps/chosen": -405.3009338378906,
"logps/rejected": -484.6737365722656,
"loss": 0.4083,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8974357843399048,
"rewards/margins": 1.3567252159118652,
"rewards/rejected": -3.2541611194610596,
"step": 240
},
{
"epoch": 0.29,
"grad_norm": 42.352515667816355,
"learning_rate": 4.481596432975201e-07,
"logits/chosen": -0.6702763438224792,
"logits/rejected": -0.49778255820274353,
"logps/chosen": -340.3480224609375,
"logps/rejected": -434.61376953125,
"loss": 0.425,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.6621681451797485,
"rewards/margins": 1.0998741388320923,
"rewards/rejected": -2.762042284011841,
"step": 250
},
{
"epoch": 0.3,
"grad_norm": 51.54256095538614,
"learning_rate": 4.41869747515886e-07,
"logits/chosen": -0.6597603559494019,
"logits/rejected": -0.5498248338699341,
"logps/chosen": -365.7995910644531,
"logps/rejected": -490.1622009277344,
"loss": 0.4244,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4162827730178833,
"rewards/margins": 1.2882452011108398,
"rewards/rejected": -2.7045278549194336,
"step": 260
},
{
"epoch": 0.31,
"grad_norm": 48.71803198385668,
"learning_rate": 4.352694346459396e-07,
"logits/chosen": 0.04401933029294014,
"logits/rejected": 0.16322588920593262,
"logps/chosen": -363.21539306640625,
"logps/rejected": -463.6495056152344,
"loss": 0.4206,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.5739765167236328,
"rewards/margins": 1.1849424839019775,
"rewards/rejected": -2.7589190006256104,
"step": 270
},
{
"epoch": 0.32,
"grad_norm": 38.68223370724194,
"learning_rate": 4.2836938302509256e-07,
"logits/chosen": -0.13973233103752136,
"logits/rejected": 0.19283699989318848,
"logps/chosen": -328.5007019042969,
"logps/rejected": -440.18365478515625,
"loss": 0.4456,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.279756784439087,
"rewards/margins": 1.4430491924285889,
"rewards/rejected": -2.7228057384490967,
"step": 280
},
{
"epoch": 0.33,
"grad_norm": 45.704934038680605,
"learning_rate": 4.2118075592405874e-07,
"logits/chosen": 0.20580144226551056,
"logits/rejected": 0.34621715545654297,
"logps/chosen": -407.57373046875,
"logps/rejected": -517.0430908203125,
"loss": 0.4242,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.8687858581542969,
"rewards/margins": 1.2867904901504517,
"rewards/rejected": -3.155576229095459,
"step": 290
},
{
"epoch": 0.35,
"grad_norm": 48.006993514366904,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": 0.6578917503356934,
"logits/rejected": 0.7554408311843872,
"logps/chosen": -349.4103088378906,
"logps/rejected": -480.834228515625,
"loss": 0.4348,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.828386664390564,
"rewards/margins": 1.3594980239868164,
"rewards/rejected": -3.18788480758667,
"step": 300
},
{
"epoch": 0.35,
"eval_logits/chosen": -0.5939264297485352,
"eval_logits/rejected": -0.34991100430488586,
"eval_logps/chosen": -545.4883422851562,
"eval_logps/rejected": -617.2100830078125,
"eval_loss": 0.5011798739433289,
"eval_rewards/accuracies": 0.7734375,
"eval_rewards/chosen": -2.0997684001922607,
"eval_rewards/margins": 0.7412738800048828,
"eval_rewards/rejected": -2.8410420417785645,
"eval_runtime": 98.127,
"eval_samples_per_second": 20.382,
"eval_steps_per_second": 0.326,
"step": 300
},
{
"epoch": 0.36,
"grad_norm": 43.38987414729455,
"learning_rate": 4.059847439122671e-07,
"logits/chosen": 0.5874438285827637,
"logits/rejected": 0.8824877738952637,
"logps/chosen": -419.9178771972656,
"logps/rejected": -517.2019653320312,
"loss": 0.4149,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.0750081539154053,
"rewards/margins": 1.2572228908538818,
"rewards/rejected": -3.332231044769287,
"step": 310
},
{
"epoch": 0.37,
"grad_norm": 56.605050092804255,
"learning_rate": 3.98001943918432e-07,
"logits/chosen": 0.6735237836837769,
"logits/rejected": 1.019078254699707,
"logps/chosen": -373.03009033203125,
"logps/rejected": -483.0083923339844,
"loss": 0.4049,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.7667083740234375,
"rewards/margins": 1.1942052841186523,
"rewards/rejected": -2.960913896560669,
"step": 320
},
{
"epoch": 0.38,
"grad_norm": 57.81664075376147,
"learning_rate": 3.8977969850346866e-07,
"logits/chosen": 0.4839138090610504,
"logits/rejected": 0.8274878263473511,
"logps/chosen": -387.33673095703125,
"logps/rejected": -499.78094482421875,
"loss": 0.4004,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.7052650451660156,
"rewards/margins": 1.477137565612793,
"rewards/rejected": -3.1824028491973877,
"step": 330
},
{
"epoch": 0.39,
"grad_norm": 50.66567087546677,
"learning_rate": 3.8133131005357465e-07,
"logits/chosen": 0.23904335498809814,
"logits/rejected": 0.6436888575553894,
"logps/chosen": -374.50750732421875,
"logps/rejected": -534.21435546875,
"loss": 0.3943,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.705120325088501,
"rewards/margins": 1.7923282384872437,
"rewards/rejected": -3.497448444366455,
"step": 340
},
{
"epoch": 0.4,
"grad_norm": 41.43510772615216,
"learning_rate": 3.7267044682118435e-07,
"logits/chosen": 0.3483354449272156,
"logits/rejected": 0.6899020075798035,
"logps/chosen": -369.47418212890625,
"logps/rejected": -496.38262939453125,
"loss": 0.3884,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.8410135507583618,
"rewards/margins": 1.4833061695098877,
"rewards/rejected": -3.324319362640381,
"step": 350
},
{
"epoch": 0.41,
"grad_norm": 46.89248795203356,
"learning_rate": 3.638111208117425e-07,
"logits/chosen": 0.22267869114875793,
"logits/rejected": 0.4508979916572571,
"logps/chosen": -409.98974609375,
"logps/rejected": -508.88055419921875,
"loss": 0.4111,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0787599086761475,
"rewards/margins": 1.0934727191925049,
"rewards/rejected": -3.1722328662872314,
"step": 360
},
{
"epoch": 0.43,
"grad_norm": 43.02323311612351,
"learning_rate": 3.5476766511433605e-07,
"logits/chosen": 0.1800430715084076,
"logits/rejected": 0.6425480842590332,
"logps/chosen": -431.10736083984375,
"logps/rejected": -516.4458618164062,
"loss": 0.4194,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9750921726226807,
"rewards/margins": 1.3207170963287354,
"rewards/rejected": -3.295809268951416,
"step": 370
},
{
"epoch": 0.44,
"grad_norm": 43.154999607698095,
"learning_rate": 3.455547107128602e-07,
"logits/chosen": 0.3740110993385315,
"logits/rejected": 0.8220480680465698,
"logps/chosen": -410.6556701660156,
"logps/rejected": -515.9549560546875,
"loss": 0.3767,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.596968412399292,
"rewards/margins": 1.6267616748809814,
"rewards/rejected": -3.2237300872802734,
"step": 380
},
{
"epoch": 0.45,
"grad_norm": 56.90068596534485,
"learning_rate": 3.361871628152338e-07,
"logits/chosen": 0.6576219797134399,
"logits/rejected": 1.0373657941818237,
"logps/chosen": -398.47906494140625,
"logps/rejected": -556.7415771484375,
"loss": 0.4239,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.9783694744110107,
"rewards/margins": 1.5746887922286987,
"rewards/rejected": -3.55305814743042,
"step": 390
},
{
"epoch": 0.46,
"grad_norm": 41.49097538770333,
"learning_rate": 3.2668017673896077e-07,
"logits/chosen": 0.6066378355026245,
"logits/rejected": 1.0441324710845947,
"logps/chosen": -376.2064514160156,
"logps/rejected": -497.462890625,
"loss": 0.3733,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.7407310009002686,
"rewards/margins": 1.581956148147583,
"rewards/rejected": -3.3226871490478516,
"step": 400
},
{
"epoch": 0.46,
"eval_logits/chosen": -0.5456388592720032,
"eval_logits/rejected": -0.2280205935239792,
"eval_logps/chosen": -550.5716552734375,
"eval_logps/rejected": -626.190185546875,
"eval_loss": 0.47210657596588135,
"eval_rewards/accuracies": 0.77734375,
"eval_rewards/chosen": -2.1506011486053467,
"eval_rewards/margins": 0.7802413105964661,
"eval_rewards/rejected": -2.930842399597168,
"eval_runtime": 98.1161,
"eval_samples_per_second": 20.384,
"eval_steps_per_second": 0.326,
"step": 400
},
{
"epoch": 0.47,
"grad_norm": 47.55353494901972,
"learning_rate": 3.1704913339205103e-07,
"logits/chosen": 0.5084329843521118,
"logits/rejected": 0.796318531036377,
"logps/chosen": -409.43585205078125,
"logps/rejected": -561.5556030273438,
"loss": 0.3928,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.992550253868103,
"rewards/margins": 1.6422802209854126,
"rewards/rejected": -3.6348299980163574,
"step": 410
},
{
"epoch": 0.48,
"grad_norm": 41.646877730648264,
"learning_rate": 3.0730961438896885e-07,
"logits/chosen": 0.4776241183280945,
"logits/rejected": 0.7627217769622803,
"logps/chosen": -482.1835021972656,
"logps/rejected": -587.5792236328125,
"loss": 0.3881,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.093543767929077,
"rewards/margins": 1.4904192686080933,
"rewards/rejected": -3.583962917327881,
"step": 420
},
{
"epoch": 0.5,
"grad_norm": 68.32669660083764,
"learning_rate": 2.9747737684186795e-07,
"logits/chosen": 0.7197389602661133,
"logits/rejected": 0.8317638635635376,
"logps/chosen": -388.28656005859375,
"logps/rejected": -509.2151794433594,
"loss": 0.3841,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7809364795684814,
"rewards/margins": 1.5095723867416382,
"rewards/rejected": -3.290508985519409,
"step": 430
},
{
"epoch": 0.51,
"grad_norm": 46.78192200543751,
"learning_rate": 2.8756832786789663e-07,
"logits/chosen": 0.3376988172531128,
"logits/rejected": 0.8295138478279114,
"logps/chosen": -403.0928649902344,
"logps/rejected": -518.611083984375,
"loss": 0.4029,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.744091272354126,
"rewards/margins": 1.5630067586898804,
"rewards/rejected": -3.307097911834717,
"step": 440
},
{
"epoch": 0.52,
"grad_norm": 49.72034219777285,
"learning_rate": 2.7759849885381747e-07,
"logits/chosen": 0.3917238414287567,
"logits/rejected": 0.9007431268692017,
"logps/chosen": -451.806884765625,
"logps/rejected": -584.4218139648438,
"loss": 0.3785,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.0996882915496826,
"rewards/margins": 1.9295704364776611,
"rewards/rejected": -4.029258728027344,
"step": 450
},
{
"epoch": 0.53,
"grad_norm": 38.3046078852496,
"learning_rate": 2.675840195195762e-07,
"logits/chosen": 0.1938302218914032,
"logits/rejected": 0.7046247720718384,
"logps/chosen": -375.27606201171875,
"logps/rejected": -523.9801025390625,
"loss": 0.3934,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.7863012552261353,
"rewards/margins": 1.6471843719482422,
"rewards/rejected": -3.433485507965088,
"step": 460
},
{
"epoch": 0.54,
"grad_norm": 39.056692194028,
"learning_rate": 2.575410918227829e-07,
"logits/chosen": 0.09105312079191208,
"logits/rejected": 0.5196784138679504,
"logps/chosen": -413.9867248535156,
"logps/rejected": -532.4803466796875,
"loss": 0.3755,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.720029592514038,
"rewards/margins": 1.528271198272705,
"rewards/rejected": -3.2483010292053223,
"step": 470
},
{
"epoch": 0.55,
"grad_norm": 46.6868254294557,
"learning_rate": 2.474859637463226e-07,
"logits/chosen": 0.21693472564220428,
"logits/rejected": 0.8155421018600464,
"logps/chosen": -418.37652587890625,
"logps/rejected": -540.866455078125,
"loss": 0.3846,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9778916835784912,
"rewards/margins": 1.7564996480941772,
"rewards/rejected": -3.734391450881958,
"step": 480
},
{
"epoch": 0.56,
"grad_norm": 45.512117273870444,
"learning_rate": 2.3743490301150355e-07,
"logits/chosen": 0.2570355236530304,
"logits/rejected": 0.8997817039489746,
"logps/chosen": -381.27801513671875,
"logps/rejected": -525.5377807617188,
"loss": 0.4012,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.6076080799102783,
"rewards/margins": 1.8304884433746338,
"rewards/rejected": -3.438096523284912,
"step": 490
},
{
"epoch": 0.58,
"grad_norm": 46.09704078060399,
"learning_rate": 2.274041707592724e-07,
"logits/chosen": 0.7786660194396973,
"logits/rejected": 1.2057403326034546,
"logps/chosen": -416.14068603515625,
"logps/rejected": -602.9859008789062,
"loss": 0.3689,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.2852025032043457,
"rewards/margins": 1.9095999002456665,
"rewards/rejected": -4.194802284240723,
"step": 500
},
{
"epoch": 0.58,
"eval_logits/chosen": -0.4774431586265564,
"eval_logits/rejected": -0.1090613454580307,
"eval_logps/chosen": -540.1826171875,
"eval_logps/rejected": -627.9595336914062,
"eval_loss": 0.448412150144577,
"eval_rewards/accuracies": 0.796875,
"eval_rewards/chosen": -2.046710968017578,
"eval_rewards/margins": 0.9018256068229675,
"eval_rewards/rejected": -2.9485368728637695,
"eval_runtime": 98.1848,
"eval_samples_per_second": 20.37,
"eval_steps_per_second": 0.326,
"step": 500
},
{
"epoch": 0.59,
"grad_norm": 42.744213876119844,
"learning_rate": 2.17409995242075e-07,
"logits/chosen": 0.6994825005531311,
"logits/rejected": 1.289393663406372,
"logps/chosen": -405.2342224121094,
"logps/rejected": -555.2643432617188,
"loss": 0.3921,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -2.062455415725708,
"rewards/margins": 1.8831449747085571,
"rewards/rejected": -3.9456000328063965,
"step": 510
},
{
"epoch": 0.6,
"grad_norm": 44.25862131066792,
"learning_rate": 2.0746854556892544e-07,
"logits/chosen": 0.7421714067459106,
"logits/rejected": 0.9166728258132935,
"logps/chosen": -363.72222900390625,
"logps/rejected": -499.4908752441406,
"loss": 0.4102,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.795539140701294,
"rewards/margins": 1.4331713914871216,
"rewards/rejected": -3.228710889816284,
"step": 520
},
{
"epoch": 0.61,
"grad_norm": 40.42456029676201,
"learning_rate": 1.9759590554616173e-07,
"logits/chosen": 0.2788628935813904,
"logits/rejected": 0.5978427529335022,
"logps/chosen": -387.8989562988281,
"logps/rejected": -499.9576110839844,
"loss": 0.4053,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.6805702447891235,
"rewards/margins": 1.3731516599655151,
"rewards/rejected": -3.0537219047546387,
"step": 530
},
{
"epoch": 0.62,
"grad_norm": 43.79592437572997,
"learning_rate": 1.8780804765620746e-07,
"logits/chosen": 0.37570881843566895,
"logits/rejected": 0.5200439691543579,
"logps/chosen": -394.23284912109375,
"logps/rejected": -548.2333374023438,
"loss": 0.384,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5941615104675293,
"rewards/margins": 1.527552843093872,
"rewards/rejected": -3.1217141151428223,
"step": 540
},
{
"epoch": 0.63,
"grad_norm": 44.94669101797897,
"learning_rate": 1.7812080721643973e-07,
"logits/chosen": 0.6379637122154236,
"logits/rejected": 1.1335102319717407,
"logps/chosen": -422.62200927734375,
"logps/rejected": -535.2354736328125,
"loss": 0.3932,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -2.0112552642822266,
"rewards/margins": 1.6570736169815063,
"rewards/rejected": -3.6683287620544434,
"step": 550
},
{
"epoch": 0.64,
"grad_norm": 48.51576878403802,
"learning_rate": 1.6854985675997063e-07,
"logits/chosen": 0.5151522755622864,
"logits/rejected": 0.9227844476699829,
"logps/chosen": -410.75244140625,
"logps/rejected": -543.8304443359375,
"loss": 0.3729,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.9362386465072632,
"rewards/margins": 1.548099398612976,
"rewards/rejected": -3.4843380451202393,
"step": 560
},
{
"epoch": 0.66,
"grad_norm": 42.77055197730572,
"learning_rate": 1.5911068067978818e-07,
"logits/chosen": 0.7765737771987915,
"logits/rejected": 0.9592781066894531,
"logps/chosen": -391.6842041015625,
"logps/rejected": -575.3435668945312,
"loss": 0.3642,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.0532357692718506,
"rewards/margins": 1.811832070350647,
"rewards/rejected": -3.865067720413208,
"step": 570
},
{
"epoch": 0.67,
"grad_norm": 51.09604434640814,
"learning_rate": 1.4981855017728197e-07,
"logits/chosen": 0.596177875995636,
"logits/rejected": 0.7803729772567749,
"logps/chosen": -459.51422119140625,
"logps/rejected": -612.7260131835938,
"loss": 0.388,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.421908140182495,
"rewards/margins": 1.5485522747039795,
"rewards/rejected": -3.9704601764678955,
"step": 580
},
{
"epoch": 0.68,
"grad_norm": 51.69715596466598,
"learning_rate": 1.406884985556804e-07,
"logits/chosen": 0.6335197687149048,
"logits/rejected": 1.1092630624771118,
"logps/chosen": -429.76690673828125,
"logps/rejected": -580.2468872070312,
"loss": 0.3807,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -2.2301127910614014,
"rewards/margins": 1.8223087787628174,
"rewards/rejected": -4.052420616149902,
"step": 590
},
{
"epoch": 0.69,
"grad_norm": 48.435911535292384,
"learning_rate": 1.3173529689837354e-07,
"logits/chosen": 0.5912660956382751,
"logits/rejected": 1.1899088621139526,
"logps/chosen": -393.476318359375,
"logps/rejected": -521.782958984375,
"loss": 0.3829,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8528053760528564,
"rewards/margins": 1.6730989217758179,
"rewards/rejected": -3.5259041786193848,
"step": 600
},
{
"epoch": 0.69,
"eval_logits/chosen": -0.509851336479187,
"eval_logits/rejected": -0.14121857285499573,
"eval_logps/chosen": -538.1624145507812,
"eval_logps/rejected": -623.8541259765625,
"eval_loss": 0.44193577766418457,
"eval_rewards/accuracies": 0.80859375,
"eval_rewards/chosen": -2.0265088081359863,
"eval_rewards/margins": 0.8809735774993896,
"eval_rewards/rejected": -2.907482147216797,
"eval_runtime": 98.167,
"eval_samples_per_second": 20.373,
"eval_steps_per_second": 0.326,
"step": 600
},
{
"epoch": 0.7,
"grad_norm": 48.985755457205066,
"learning_rate": 1.2297343017146726e-07,
"logits/chosen": 0.7694305181503296,
"logits/rejected": 1.232879877090454,
"logps/chosen": -402.1836853027344,
"logps/rejected": -533.408447265625,
"loss": 0.3929,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.905542016029358,
"rewards/margins": 1.6221548318862915,
"rewards/rejected": -3.5276970863342285,
"step": 610
},
{
"epoch": 0.71,
"grad_norm": 45.28513242475784,
"learning_rate": 1.1441707378923474e-07,
"logits/chosen": 0.5253760814666748,
"logits/rejected": 1.0413273572921753,
"logps/chosen": -359.5643615722656,
"logps/rejected": -514.2081909179688,
"loss": 0.3806,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.579487919807434,
"rewards/margins": 1.767327070236206,
"rewards/rejected": -3.3468146324157715,
"step": 620
},
{
"epoch": 0.73,
"grad_norm": 47.72652227607087,
"learning_rate": 1.06080070680377e-07,
"logits/chosen": 0.4920094907283783,
"logits/rejected": 1.009433627128601,
"logps/chosen": -399.2576599121094,
"logps/rejected": -537.9578247070312,
"loss": 0.3821,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.7137682437896729,
"rewards/margins": 1.7276941537857056,
"rewards/rejected": -3.441462755203247,
"step": 630
},
{
"epoch": 0.74,
"grad_norm": 42.1168430015071,
"learning_rate": 9.797590889219587e-08,
"logits/chosen": 0.3111940026283264,
"logits/rejected": 0.8665814399719238,
"logps/chosen": -396.842529296875,
"logps/rejected": -543.9876098632812,
"loss": 0.3843,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.7231413125991821,
"rewards/margins": 1.8444896936416626,
"rewards/rejected": -3.567631244659424,
"step": 640
},
{
"epoch": 0.75,
"grad_norm": 47.41933670532933,
"learning_rate": 9.011769976891367e-08,
"logits/chosen": 0.4944031834602356,
"logits/rejected": 0.8744715452194214,
"logps/chosen": -398.05615234375,
"logps/rejected": -543.6096801757812,
"loss": 0.3763,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.8718990087509155,
"rewards/margins": 1.6193087100982666,
"rewards/rejected": -3.4912078380584717,
"step": 650
},
{
"epoch": 0.76,
"grad_norm": 52.75260796298546,
"learning_rate": 8.251815673944218e-08,
"logits/chosen": 0.5813334584236145,
"logits/rejected": 0.9786221385002136,
"logps/chosen": -443.66070556640625,
"logps/rejected": -576.3490600585938,
"loss": 0.3822,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.220869779586792,
"rewards/margins": 1.742889404296875,
"rewards/rejected": -3.963758945465088,
"step": 660
},
{
"epoch": 0.77,
"grad_norm": 46.64520061062158,
"learning_rate": 7.518957474892148e-08,
"logits/chosen": 0.6128578186035156,
"logits/rejected": 1.1231881380081177,
"logps/chosen": -427.1106872558594,
"logps/rejected": -589.3102416992188,
"loss": 0.3662,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -2.0986740589141846,
"rewards/margins": 1.9974746704101562,
"rewards/rejected": -4.096148490905762,
"step": 670
},
{
"epoch": 0.78,
"grad_norm": 44.32719204523107,
"learning_rate": 6.814381036730274e-08,
"logits/chosen": 0.44363918900489807,
"logits/rejected": 0.8115978240966797,
"logps/chosen": -397.6707763671875,
"logps/rejected": -538.56591796875,
"loss": 0.3962,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.9131567478179932,
"rewards/margins": 1.6610631942749023,
"rewards/rejected": -3.5742194652557373,
"step": 680
},
{
"epoch": 0.79,
"grad_norm": 43.744460103075866,
"learning_rate": 6.139226260715872e-08,
"logits/chosen": 0.34574732184410095,
"logits/rejected": 0.7309020161628723,
"logps/chosen": -390.32464599609375,
"logps/rejected": -550.9197998046875,
"loss": 0.3747,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.7989364862442017,
"rewards/margins": 1.8234875202178955,
"rewards/rejected": -3.622424364089966,
"step": 690
},
{
"epoch": 0.81,
"grad_norm": 48.21671181557863,
"learning_rate": 5.4945854481754734e-08,
"logits/chosen": 0.4160235822200775,
"logits/rejected": 1.0240848064422607,
"logps/chosen": -393.590576171875,
"logps/rejected": -540.9241333007812,
"loss": 0.3725,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.9948371648788452,
"rewards/margins": 1.689173936843872,
"rewards/rejected": -3.6840109825134277,
"step": 700
},
{
"epoch": 0.81,
"eval_logits/chosen": -0.6920372247695923,
"eval_logits/rejected": -0.3223564326763153,
"eval_logps/chosen": -527.349609375,
"eval_logps/rejected": -613.8932495117188,
"eval_loss": 0.43294557929039,
"eval_rewards/accuracies": 0.82421875,
"eval_rewards/chosen": -1.9183804988861084,
"eval_rewards/margins": 0.8894931077957153,
"eval_rewards/rejected": -2.8078737258911133,
"eval_runtime": 98.1374,
"eval_samples_per_second": 20.38,
"eval_steps_per_second": 0.326,
"step": 700
},
{
"epoch": 0.82,
"grad_norm": 42.53084626680963,
"learning_rate": 4.881501533321605e-08,
"logits/chosen": 0.6980074048042297,
"logits/rejected": 1.0298550128936768,
"logps/chosen": -367.0564880371094,
"logps/rejected": -539.99560546875,
"loss": 0.3547,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.8503217697143555,
"rewards/margins": 1.9031312465667725,
"rewards/rejected": -3.753452777862549,
"step": 710
},
{
"epoch": 0.83,
"grad_norm": 43.590506229310456,
"learning_rate": 4.300966395938377e-08,
"logits/chosen": 0.35197392106056213,
"logits/rejected": 0.8350766897201538,
"logps/chosen": -427.9037170410156,
"logps/rejected": -580.8751831054688,
"loss": 0.3788,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.0454230308532715,
"rewards/margins": 1.8579833507537842,
"rewards/rejected": -3.9034061431884766,
"step": 720
},
{
"epoch": 0.84,
"grad_norm": 47.15415328548373,
"learning_rate": 3.7539192566655246e-08,
"logits/chosen": 0.3688026964664459,
"logits/rejected": 0.7924972772598267,
"logps/chosen": -387.2108459472656,
"logps/rejected": -532.4842529296875,
"loss": 0.3762,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -1.6555856466293335,
"rewards/margins": 1.8027565479278564,
"rewards/rejected": -3.4583423137664795,
"step": 730
},
{
"epoch": 0.85,
"grad_norm": 41.72651096064494,
"learning_rate": 3.24124515747731e-08,
"logits/chosen": 0.4526204466819763,
"logits/rejected": 0.7684503793716431,
"logps/chosen": -406.00042724609375,
"logps/rejected": -571.0294189453125,
"loss": 0.3881,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.0768213272094727,
"rewards/margins": 1.7967207431793213,
"rewards/rejected": -3.8735415935516357,
"step": 740
},
{
"epoch": 0.86,
"grad_norm": 47.004010938683734,
"learning_rate": 2.763773529814506e-08,
"logits/chosen": 0.24592173099517822,
"logits/rejected": 0.5948923826217651,
"logps/chosen": -437.3650817871094,
"logps/rejected": -581.8604125976562,
"loss": 0.3772,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.974538803100586,
"rewards/margins": 1.9196981191635132,
"rewards/rejected": -3.8942363262176514,
"step": 750
},
{
"epoch": 0.88,
"grad_norm": 56.33205281532714,
"learning_rate": 2.3222768526860698e-08,
"logits/chosen": 0.2990577220916748,
"logits/rejected": 0.7854124903678894,
"logps/chosen": -404.5032653808594,
"logps/rejected": -561.688720703125,
"loss": 0.3938,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.9624149799346924,
"rewards/margins": 1.855332374572754,
"rewards/rejected": -3.8177475929260254,
"step": 760
},
{
"epoch": 0.89,
"grad_norm": 43.51724396608159,
"learning_rate": 1.9174694029115146e-08,
"logits/chosen": 0.18542930483818054,
"logits/rejected": 0.5257433652877808,
"logps/chosen": -424.1546325683594,
"logps/rejected": -532.9678344726562,
"loss": 0.3879,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.8116706609725952,
"rewards/margins": 1.6489944458007812,
"rewards/rejected": -3.460665225982666,
"step": 770
},
{
"epoch": 0.9,
"grad_norm": 51.298202533295,
"learning_rate": 1.5500060995258134e-08,
"logits/chosen": 0.3892073333263397,
"logits/rejected": 0.8499504327774048,
"logps/chosen": -402.9557189941406,
"logps/rejected": -541.4577026367188,
"loss": 0.349,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -1.8981506824493408,
"rewards/margins": 1.7807424068450928,
"rewards/rejected": -3.6788933277130127,
"step": 780
},
{
"epoch": 0.91,
"grad_norm": 56.6017962844276,
"learning_rate": 1.2204814442165812e-08,
"logits/chosen": 0.3551040589809418,
"logits/rejected": 0.8326929807662964,
"logps/chosen": -402.6451416015625,
"logps/rejected": -552.5445556640625,
"loss": 0.386,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -1.9424476623535156,
"rewards/margins": 1.9533637762069702,
"rewards/rejected": -3.8958117961883545,
"step": 790
},
{
"epoch": 0.92,
"grad_norm": 49.36333315496645,
"learning_rate": 9.294285595075669e-09,
"logits/chosen": 0.06378497928380966,
"logits/rejected": 0.5464959144592285,
"logps/chosen": -430.5462951660156,
"logps/rejected": -562.2453002929688,
"loss": 0.4052,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.9655787944793701,
"rewards/margins": 1.7598493099212646,
"rewards/rejected": -3.7254281044006348,
"step": 800
},
{
"epoch": 0.92,
"eval_logits/chosen": -0.8024855852127075,
"eval_logits/rejected": -0.4436371624469757,
"eval_logps/chosen": -524.2042236328125,
"eval_logps/rejected": -612.2493286132812,
"eval_loss": 0.42916327714920044,
"eval_rewards/accuracies": 0.82421875,
"eval_rewards/chosen": -1.8869271278381348,
"eval_rewards/margins": 0.9045072793960571,
"eval_rewards/rejected": -2.7914342880249023,
"eval_runtime": 98.1154,
"eval_samples_per_second": 20.384,
"eval_steps_per_second": 0.326,
"step": 800
},
{
"epoch": 0.93,
"grad_norm": 48.45659164140374,
"learning_rate": 6.773183262446914e-09,
"logits/chosen": 0.2793930172920227,
"logits/rejected": 0.8751212954521179,
"logps/chosen": -400.6767883300781,
"logps/rejected": -544.5294799804688,
"loss": 0.3882,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.9480419158935547,
"rewards/margins": 1.711806297302246,
"rewards/rejected": -3.65984845161438,
"step": 810
},
{
"epoch": 0.94,
"grad_norm": 48.21463789648397,
"learning_rate": 4.645586217799452e-09,
"logits/chosen": 0.24326184391975403,
"logits/rejected": 0.6566700339317322,
"logps/chosen": -410.050537109375,
"logps/rejected": -576.2342529296875,
"loss": 0.4036,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.893699049949646,
"rewards/margins": 1.8373210430145264,
"rewards/rejected": -3.731020450592041,
"step": 820
},
{
"epoch": 0.96,
"grad_norm": 52.40196558130504,
"learning_rate": 2.9149366008568987e-09,
"logits/chosen": 0.2516610622406006,
"logits/rejected": 0.6028949022293091,
"logps/chosen": -397.42755126953125,
"logps/rejected": -558.4515380859375,
"loss": 0.3856,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.790833830833435,
"rewards/margins": 1.8916391134262085,
"rewards/rejected": -3.6824734210968018,
"step": 830
},
{
"epoch": 0.97,
"grad_norm": 45.18885600860689,
"learning_rate": 1.5840343486700215e-09,
"logits/chosen": 0.011555513367056847,
"logits/rejected": 0.5860650539398193,
"logps/chosen": -406.7879638671875,
"logps/rejected": -555.0967407226562,
"loss": 0.3728,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -1.6412866115570068,
"rewards/margins": 1.9584299325942993,
"rewards/rejected": -3.5997166633605957,
"step": 840
},
{
"epoch": 0.98,
"grad_norm": 45.90265978309936,
"learning_rate": 6.550326657293881e-10,
"logits/chosen": 0.08577422052621841,
"logits/rejected": 0.549113929271698,
"logps/chosen": -403.1221618652344,
"logps/rejected": -571.7515869140625,
"loss": 0.3525,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.8727197647094727,
"rewards/margins": 2.0556139945983887,
"rewards/rejected": -3.9283337593078613,
"step": 850
},
{
"epoch": 0.99,
"grad_norm": 48.04876217861222,
"learning_rate": 1.2943454039654467e-10,
"logits/chosen": 0.5522348284721375,
"logits/rejected": 0.82818204164505,
"logps/chosen": -399.8492126464844,
"logps/rejected": -529.6903076171875,
"loss": 0.3623,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -2.0089919567108154,
"rewards/margins": 1.499420404434204,
"rewards/rejected": -3.5084125995635986,
"step": 860
},
{
"epoch": 1.0,
"step": 868,
"total_flos": 0.0,
"train_loss": 0.42912535238925215,
"train_runtime": 13911.1927,
"train_samples_per_second": 7.989,
"train_steps_per_second": 0.062
}
],
"logging_steps": 10,
"max_steps": 868,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}