martimfasantos's picture
Model save
d9e1b4d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 4164,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007204610951008645,
"grad_norm": 14.58157977905889,
"learning_rate": 1.199040767386091e-10,
"logits/chosen": -1.901450514793396,
"logits/rejected": -1.9076323509216309,
"logps/chosen": -0.8524526953697205,
"logps/rejected": -0.9626365900039673,
"loss": 1.1927,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.704905390739441,
"rewards/margins": 0.22036786377429962,
"rewards/rejected": -1.9252731800079346,
"step": 1
},
{
"epoch": 0.007204610951008645,
"grad_norm": 17.76736608782741,
"learning_rate": 1.199040767386091e-09,
"logits/chosen": -2.0206170082092285,
"logits/rejected": -2.0063347816467285,
"logps/chosen": -1.0049196481704712,
"logps/rejected": -1.1093952655792236,
"loss": 1.2168,
"rewards/accuracies": 0.5208333134651184,
"rewards/chosen": -2.0098392963409424,
"rewards/margins": 0.2089509218931198,
"rewards/rejected": -2.2187905311584473,
"step": 10
},
{
"epoch": 0.01440922190201729,
"grad_norm": 22.614753087644292,
"learning_rate": 2.398081534772182e-09,
"logits/chosen": -2.026459217071533,
"logits/rejected": -2.0231809616088867,
"logps/chosen": -1.051859736442566,
"logps/rejected": -1.1832743883132935,
"loss": 1.1863,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.103719472885132,
"rewards/margins": 0.2628290057182312,
"rewards/rejected": -2.366548776626587,
"step": 20
},
{
"epoch": 0.021613832853025938,
"grad_norm": 17.824346372572926,
"learning_rate": 3.597122302158273e-09,
"logits/chosen": -1.981697678565979,
"logits/rejected": -1.9744222164154053,
"logps/chosen": -1.053879976272583,
"logps/rejected": -1.1511423587799072,
"loss": 1.2353,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.107759952545166,
"rewards/margins": 0.19452433288097382,
"rewards/rejected": -2.3022847175598145,
"step": 30
},
{
"epoch": 0.02881844380403458,
"grad_norm": 19.247706292689507,
"learning_rate": 4.796163069544364e-09,
"logits/chosen": -2.0287587642669678,
"logits/rejected": -2.028596878051758,
"logps/chosen": -1.0359481573104858,
"logps/rejected": -1.1375384330749512,
"loss": 1.2355,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0718963146209717,
"rewards/margins": 0.20318038761615753,
"rewards/rejected": -2.2750768661499023,
"step": 40
},
{
"epoch": 0.03602305475504323,
"grad_norm": 14.992901360893413,
"learning_rate": 5.995203836930456e-09,
"logits/chosen": -1.962505578994751,
"logits/rejected": -1.9632362127304077,
"logps/chosen": -0.9416370391845703,
"logps/rejected": -1.0078415870666504,
"loss": 1.2545,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.8832740783691406,
"rewards/margins": 0.13240887224674225,
"rewards/rejected": -2.015683174133301,
"step": 50
},
{
"epoch": 0.043227665706051875,
"grad_norm": 21.508515110852976,
"learning_rate": 7.194244604316546e-09,
"logits/chosen": -2.0391106605529785,
"logits/rejected": -2.034660816192627,
"logps/chosen": -1.0891697406768799,
"logps/rejected": -1.145775556564331,
"loss": 1.2676,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.1783394813537598,
"rewards/margins": 0.11321155726909637,
"rewards/rejected": -2.291551113128662,
"step": 60
},
{
"epoch": 0.05043227665706052,
"grad_norm": 20.688044326046224,
"learning_rate": 8.393285371702639e-09,
"logits/chosen": -2.029348373413086,
"logits/rejected": -2.016831636428833,
"logps/chosen": -1.1090962886810303,
"logps/rejected": -1.204714059829712,
"loss": 1.226,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.2181925773620605,
"rewards/margins": 0.19123554229736328,
"rewards/rejected": -2.409428119659424,
"step": 70
},
{
"epoch": 0.05763688760806916,
"grad_norm": 24.41033214526541,
"learning_rate": 9.592326139088728e-09,
"logits/chosen": -2.046764850616455,
"logits/rejected": -2.043759822845459,
"logps/chosen": -1.166001558303833,
"logps/rejected": -1.237687110900879,
"loss": 1.2535,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.332003116607666,
"rewards/margins": 0.1433708667755127,
"rewards/rejected": -2.475374221801758,
"step": 80
},
{
"epoch": 0.06484149855907781,
"grad_norm": 15.594986746473925,
"learning_rate": 1.0791366906474819e-08,
"logits/chosen": -2.0026838779449463,
"logits/rejected": -2.00419545173645,
"logps/chosen": -1.0416425466537476,
"logps/rejected": -1.148652195930481,
"loss": 1.215,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.083285093307495,
"rewards/margins": 0.2140192985534668,
"rewards/rejected": -2.297304391860962,
"step": 90
},
{
"epoch": 0.07204610951008646,
"grad_norm": 19.00699314951204,
"learning_rate": 1.1990407673860912e-08,
"logits/chosen": -2.040858268737793,
"logits/rejected": -2.0346200466156006,
"logps/chosen": -1.0072879791259766,
"logps/rejected": -1.1140906810760498,
"loss": 1.2176,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.014575958251953,
"rewards/margins": 0.21360567212104797,
"rewards/rejected": -2.2281813621520996,
"step": 100
},
{
"epoch": 0.0792507204610951,
"grad_norm": 16.51858878389513,
"learning_rate": 1.3189448441247003e-08,
"logits/chosen": -1.9792842864990234,
"logits/rejected": -1.9680954217910767,
"logps/chosen": -1.0292143821716309,
"logps/rejected": -1.1284914016723633,
"loss": 1.2285,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0584287643432617,
"rewards/margins": 0.19855372607707977,
"rewards/rejected": -2.2569828033447266,
"step": 110
},
{
"epoch": 0.08645533141210375,
"grad_norm": 18.233760151089655,
"learning_rate": 1.4388489208633092e-08,
"logits/chosen": -1.972887396812439,
"logits/rejected": -1.9710506200790405,
"logps/chosen": -0.9646250009536743,
"logps/rejected": -1.0660240650177002,
"loss": 1.2089,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9292500019073486,
"rewards/margins": 0.20279808342456818,
"rewards/rejected": -2.1320481300354004,
"step": 120
},
{
"epoch": 0.0936599423631124,
"grad_norm": 17.354185009707173,
"learning_rate": 1.5587529976019183e-08,
"logits/chosen": -2.062894105911255,
"logits/rejected": -2.0622401237487793,
"logps/chosen": -1.0803730487823486,
"logps/rejected": -1.1523029804229736,
"loss": 1.2547,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.1607460975646973,
"rewards/margins": 0.14385986328125,
"rewards/rejected": -2.3046059608459473,
"step": 130
},
{
"epoch": 0.10086455331412104,
"grad_norm": 20.84722939621763,
"learning_rate": 1.6786570743405277e-08,
"logits/chosen": -1.9781713485717773,
"logits/rejected": -1.971671462059021,
"logps/chosen": -0.9779410362243652,
"logps/rejected": -1.1225957870483398,
"loss": 1.1689,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9558820724487305,
"rewards/margins": 0.28930944204330444,
"rewards/rejected": -2.2451915740966797,
"step": 140
},
{
"epoch": 0.10806916426512968,
"grad_norm": 20.178997351363016,
"learning_rate": 1.7985611510791365e-08,
"logits/chosen": -1.9949369430541992,
"logits/rejected": -1.990666389465332,
"logps/chosen": -1.0193713903427124,
"logps/rejected": -1.136603593826294,
"loss": 1.2076,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.038742780685425,
"rewards/margins": 0.23446419835090637,
"rewards/rejected": -2.273207187652588,
"step": 150
},
{
"epoch": 0.11527377521613832,
"grad_norm": 17.43558543499963,
"learning_rate": 1.9184652278177456e-08,
"logits/chosen": -2.002195358276367,
"logits/rejected": -1.9960581064224243,
"logps/chosen": -0.948249340057373,
"logps/rejected": -1.0968583822250366,
"loss": 1.1513,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.896498680114746,
"rewards/margins": 0.2972180247306824,
"rewards/rejected": -2.1937167644500732,
"step": 160
},
{
"epoch": 0.12247838616714697,
"grad_norm": 22.73064399272494,
"learning_rate": 2.038369304556355e-08,
"logits/chosen": -2.005837917327881,
"logits/rejected": -1.9983062744140625,
"logps/chosen": -1.0370620489120483,
"logps/rejected": -1.1609737873077393,
"loss": 1.2056,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0741240978240967,
"rewards/margins": 0.24782316386699677,
"rewards/rejected": -2.3219475746154785,
"step": 170
},
{
"epoch": 0.12968299711815562,
"grad_norm": 23.537767016698364,
"learning_rate": 2.1582733812949638e-08,
"logits/chosen": -2.0367612838745117,
"logits/rejected": -2.029956817626953,
"logps/chosen": -1.02077317237854,
"logps/rejected": -1.1086028814315796,
"loss": 1.2477,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.04154634475708,
"rewards/margins": 0.1756592094898224,
"rewards/rejected": -2.217205762863159,
"step": 180
},
{
"epoch": 0.13688760806916425,
"grad_norm": 23.18810653891807,
"learning_rate": 2.278177458033573e-08,
"logits/chosen": -2.077205181121826,
"logits/rejected": -2.0750718116760254,
"logps/chosen": -0.9699970483779907,
"logps/rejected": -1.065187692642212,
"loss": 1.2125,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9399940967559814,
"rewards/margins": 0.19038136303424835,
"rewards/rejected": -2.130375385284424,
"step": 190
},
{
"epoch": 0.1440922190201729,
"grad_norm": 22.445024845318002,
"learning_rate": 2.3980815347721823e-08,
"logits/chosen": -2.0375380516052246,
"logits/rejected": -2.034369945526123,
"logps/chosen": -1.026186227798462,
"logps/rejected": -1.1526433229446411,
"loss": 1.1878,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.052372455596924,
"rewards/margins": 0.2529138922691345,
"rewards/rejected": -2.3052866458892822,
"step": 200
},
{
"epoch": 0.15129682997118155,
"grad_norm": 21.106523936582494,
"learning_rate": 2.517985611510791e-08,
"logits/chosen": -2.036905288696289,
"logits/rejected": -2.0340917110443115,
"logps/chosen": -1.073853611946106,
"logps/rejected": -1.150638461112976,
"loss": 1.2507,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.147707223892212,
"rewards/margins": 0.1535697877407074,
"rewards/rejected": -2.301276922225952,
"step": 210
},
{
"epoch": 0.1585014409221902,
"grad_norm": 15.517023295570512,
"learning_rate": 2.6378896882494006e-08,
"logits/chosen": -1.9886398315429688,
"logits/rejected": -1.9846597909927368,
"logps/chosen": -1.0078786611557007,
"logps/rejected": -1.1769925355911255,
"loss": 1.1505,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0157573223114014,
"rewards/margins": 0.3382275700569153,
"rewards/rejected": -2.353985071182251,
"step": 220
},
{
"epoch": 0.16570605187319884,
"grad_norm": 17.085486504816398,
"learning_rate": 2.7577937649880097e-08,
"logits/chosen": -2.0190815925598145,
"logits/rejected": -2.0195024013519287,
"logps/chosen": -1.01227605342865,
"logps/rejected": -1.1264681816101074,
"loss": 1.2015,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0245521068573,
"rewards/margins": 0.2283841073513031,
"rewards/rejected": -2.252936363220215,
"step": 230
},
{
"epoch": 0.1729106628242075,
"grad_norm": 22.24970353019487,
"learning_rate": 2.8776978417266184e-08,
"logits/chosen": -2.0530002117156982,
"logits/rejected": -2.0478739738464355,
"logps/chosen": -1.0617554187774658,
"logps/rejected": -1.1395084857940674,
"loss": 1.2618,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1235108375549316,
"rewards/margins": 0.15550628304481506,
"rewards/rejected": -2.2790169715881348,
"step": 240
},
{
"epoch": 0.18011527377521613,
"grad_norm": 19.11674829624308,
"learning_rate": 2.997601918465228e-08,
"logits/chosen": -1.9721157550811768,
"logits/rejected": -1.968205451965332,
"logps/chosen": -1.0830333232879639,
"logps/rejected": -1.1736047267913818,
"loss": 1.2384,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1660666465759277,
"rewards/margins": 0.18114277720451355,
"rewards/rejected": -2.3472094535827637,
"step": 250
},
{
"epoch": 0.1873198847262248,
"grad_norm": 21.26207716688974,
"learning_rate": 3.1175059952038366e-08,
"logits/chosen": -1.9892946481704712,
"logits/rejected": -1.997536063194275,
"logps/chosen": -1.1055234670639038,
"logps/rejected": -1.2160685062408447,
"loss": 1.2139,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.2110469341278076,
"rewards/margins": 0.22109034657478333,
"rewards/rejected": -2.4321370124816895,
"step": 260
},
{
"epoch": 0.19452449567723343,
"grad_norm": 20.68682788684347,
"learning_rate": 3.237410071942446e-08,
"logits/chosen": -2.064192295074463,
"logits/rejected": -2.0562119483947754,
"logps/chosen": -1.0712614059448242,
"logps/rejected": -1.2003023624420166,
"loss": 1.1803,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.1425228118896484,
"rewards/margins": 0.2580817937850952,
"rewards/rejected": -2.400604724884033,
"step": 270
},
{
"epoch": 0.2017291066282421,
"grad_norm": 25.11253609162999,
"learning_rate": 3.3573141486810555e-08,
"logits/chosen": -2.008389472961426,
"logits/rejected": -2.0066072940826416,
"logps/chosen": -0.9357258677482605,
"logps/rejected": -1.049773097038269,
"loss": 1.1981,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.871451735496521,
"rewards/margins": 0.22809453308582306,
"rewards/rejected": -2.099546194076538,
"step": 280
},
{
"epoch": 0.20893371757925072,
"grad_norm": 21.796986905635144,
"learning_rate": 3.477218225419664e-08,
"logits/chosen": -2.0430212020874023,
"logits/rejected": -2.044867992401123,
"logps/chosen": -1.0136518478393555,
"logps/rejected": -1.1080281734466553,
"loss": 1.2347,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.027303695678711,
"rewards/margins": 0.1887526512145996,
"rewards/rejected": -2.2160563468933105,
"step": 290
},
{
"epoch": 0.21613832853025935,
"grad_norm": 20.372987042015918,
"learning_rate": 3.597122302158273e-08,
"logits/chosen": -2.0230350494384766,
"logits/rejected": -2.0147769451141357,
"logps/chosen": -1.0902057886123657,
"logps/rejected": -1.191245436668396,
"loss": 1.2137,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1804115772247314,
"rewards/margins": 0.20207944512367249,
"rewards/rejected": -2.382490873336792,
"step": 300
},
{
"epoch": 0.22334293948126802,
"grad_norm": 18.57882925465559,
"learning_rate": 3.717026378896883e-08,
"logits/chosen": -1.9549649953842163,
"logits/rejected": -1.9548736810684204,
"logps/chosen": -1.0871379375457764,
"logps/rejected": -1.1725897789001465,
"loss": 1.2377,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1742758750915527,
"rewards/margins": 0.17090332508087158,
"rewards/rejected": -2.345179557800293,
"step": 310
},
{
"epoch": 0.23054755043227665,
"grad_norm": 15.975684555438873,
"learning_rate": 3.836930455635491e-08,
"logits/chosen": -2.0300118923187256,
"logits/rejected": -2.0213980674743652,
"logps/chosen": -1.0087685585021973,
"logps/rejected": -1.1406135559082031,
"loss": 1.1934,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0175371170043945,
"rewards/margins": 0.2636898159980774,
"rewards/rejected": -2.2812271118164062,
"step": 320
},
{
"epoch": 0.2377521613832853,
"grad_norm": 15.772574632019396,
"learning_rate": 3.9568345323741003e-08,
"logits/chosen": -2.0156402587890625,
"logits/rejected": -2.0179450511932373,
"logps/chosen": -1.0460145473480225,
"logps/rejected": -1.069695234298706,
"loss": 1.3364,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -2.092029094696045,
"rewards/margins": 0.04736141860485077,
"rewards/rejected": -2.139390468597412,
"step": 330
},
{
"epoch": 0.24495677233429394,
"grad_norm": 18.38190578321181,
"learning_rate": 4.07673860911271e-08,
"logits/chosen": -2.0608153343200684,
"logits/rejected": -2.055126667022705,
"logps/chosen": -1.0875434875488281,
"logps/rejected": -1.167794108390808,
"loss": 1.2366,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1750869750976562,
"rewards/margins": 0.16050121188163757,
"rewards/rejected": -2.335588216781616,
"step": 340
},
{
"epoch": 0.2521613832853026,
"grad_norm": 19.343882527589155,
"learning_rate": 4.1966426858513185e-08,
"logits/chosen": -1.9883911609649658,
"logits/rejected": -1.9827582836151123,
"logps/chosen": -0.9889104962348938,
"logps/rejected": -1.1158192157745361,
"loss": 1.1858,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.9778209924697876,
"rewards/margins": 0.25381720066070557,
"rewards/rejected": -2.2316384315490723,
"step": 350
},
{
"epoch": 0.25936599423631124,
"grad_norm": 21.595830091643787,
"learning_rate": 4.3165467625899276e-08,
"logits/chosen": -1.9964408874511719,
"logits/rejected": -1.9924728870391846,
"logps/chosen": -1.0861265659332275,
"logps/rejected": -1.2027567625045776,
"loss": 1.1971,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.172253131866455,
"rewards/margins": 0.23326051235198975,
"rewards/rejected": -2.4055135250091553,
"step": 360
},
{
"epoch": 0.2665706051873199,
"grad_norm": 18.205487526741695,
"learning_rate": 4.4364508393285374e-08,
"logits/chosen": -2.007871389389038,
"logits/rejected": -2.007930278778076,
"logps/chosen": -1.05240797996521,
"logps/rejected": -1.1806955337524414,
"loss": 1.1777,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.10481595993042,
"rewards/margins": 0.25657495856285095,
"rewards/rejected": -2.361391067504883,
"step": 370
},
{
"epoch": 0.2737752161383285,
"grad_norm": 16.5239346092299,
"learning_rate": 4.556354916067146e-08,
"logits/chosen": -2.0331404209136963,
"logits/rejected": -2.0373125076293945,
"logps/chosen": -1.0126136541366577,
"logps/rejected": -1.0856488943099976,
"loss": 1.2688,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0252273082733154,
"rewards/margins": 0.14607074856758118,
"rewards/rejected": -2.171297788619995,
"step": 380
},
{
"epoch": 0.28097982708933716,
"grad_norm": 15.274744597058485,
"learning_rate": 4.676258992805755e-08,
"logits/chosen": -2.0328099727630615,
"logits/rejected": -2.0266880989074707,
"logps/chosen": -1.0222868919372559,
"logps/rejected": -1.1483510732650757,
"loss": 1.1822,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0445737838745117,
"rewards/margins": 0.2521281838417053,
"rewards/rejected": -2.2967021465301514,
"step": 390
},
{
"epoch": 0.2881844380403458,
"grad_norm": 19.044775104319285,
"learning_rate": 4.796163069544365e-08,
"logits/chosen": -2.0326080322265625,
"logits/rejected": -2.0330114364624023,
"logps/chosen": -0.9962165951728821,
"logps/rejected": -1.049239993095398,
"loss": 1.2716,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.9924331903457642,
"rewards/margins": 0.106046661734581,
"rewards/rejected": -2.098479986190796,
"step": 400
},
{
"epoch": 0.2953890489913545,
"grad_norm": 18.59587302480316,
"learning_rate": 4.916067146282973e-08,
"logits/chosen": -2.0307698249816895,
"logits/rejected": -2.028846025466919,
"logps/chosen": -1.0742970705032349,
"logps/rejected": -1.1461079120635986,
"loss": 1.2611,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.1485941410064697,
"rewards/margins": 0.1436215192079544,
"rewards/rejected": -2.2922158241271973,
"step": 410
},
{
"epoch": 0.3025936599423631,
"grad_norm": 16.815742736953002,
"learning_rate": 4.999992091672379e-08,
"logits/chosen": -2.0101230144500732,
"logits/rejected": -2.0144143104553223,
"logps/chosen": -1.0453675985336304,
"logps/rejected": -1.1239204406738281,
"loss": 1.2427,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0907351970672607,
"rewards/margins": 0.1571054756641388,
"rewards/rejected": -2.2478408813476562,
"step": 420
},
{
"epoch": 0.30979827089337175,
"grad_norm": 17.75452942549568,
"learning_rate": 4.999851500573209e-08,
"logits/chosen": -1.9898436069488525,
"logits/rejected": -1.9907649755477905,
"logps/chosen": -1.0584254264831543,
"logps/rejected": -1.0997257232666016,
"loss": 1.3009,
"rewards/accuracies": 0.46875,
"rewards/chosen": -2.1168508529663086,
"rewards/margins": 0.08260075747966766,
"rewards/rejected": -2.199451446533203,
"step": 430
},
{
"epoch": 0.3170028818443804,
"grad_norm": 15.929802428494124,
"learning_rate": 4.999535180235972e-08,
"logits/chosen": -1.9864879846572876,
"logits/rejected": -1.9866752624511719,
"logps/chosen": -1.0216079950332642,
"logps/rejected": -1.143937110900879,
"loss": 1.1961,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0432159900665283,
"rewards/margins": 0.2446581870317459,
"rewards/rejected": -2.287874221801758,
"step": 440
},
{
"epoch": 0.3242074927953891,
"grad_norm": 17.924032675250256,
"learning_rate": 4.9990431528966836e-08,
"logits/chosen": -2.0104360580444336,
"logits/rejected": -2.006624221801758,
"logps/chosen": -1.1455620527267456,
"logps/rejected": -1.1853464841842651,
"loss": 1.3022,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.291124105453491,
"rewards/margins": 0.07956884801387787,
"rewards/rejected": -2.3706929683685303,
"step": 450
},
{
"epoch": 0.3314121037463977,
"grad_norm": 24.17220460895587,
"learning_rate": 4.9983754531428326e-08,
"logits/chosen": -2.0079081058502197,
"logits/rejected": -2.00258731842041,
"logps/chosen": -1.1706523895263672,
"logps/rejected": -1.2871944904327393,
"loss": 1.2011,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.3413047790527344,
"rewards/margins": 0.23308411240577698,
"rewards/rejected": -2.5743889808654785,
"step": 460
},
{
"epoch": 0.33861671469740634,
"grad_norm": 22.958829143377635,
"learning_rate": 4.997532127910954e-08,
"logits/chosen": -2.04119873046875,
"logits/rejected": -2.029076099395752,
"logps/chosen": -1.100618839263916,
"logps/rejected": -1.202358365058899,
"loss": 1.2196,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.201237678527832,
"rewards/margins": 0.20347890257835388,
"rewards/rejected": -2.404716730117798,
"step": 470
},
{
"epoch": 0.345821325648415,
"grad_norm": 21.220771734165737,
"learning_rate": 4.996513236483331e-08,
"logits/chosen": -2.10054087638855,
"logits/rejected": -2.090383768081665,
"logps/chosen": -0.9847520589828491,
"logps/rejected": -1.1071968078613281,
"loss": 1.1835,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -1.9695041179656982,
"rewards/margins": 0.24488914012908936,
"rewards/rejected": -2.2143936157226562,
"step": 480
},
{
"epoch": 0.3530259365994236,
"grad_norm": 18.815281247411335,
"learning_rate": 4.9953188504838225e-08,
"logits/chosen": -2.023686408996582,
"logits/rejected": -2.0228590965270996,
"logps/chosen": -0.9880355596542358,
"logps/rejected": -1.1021173000335693,
"loss": 1.1932,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.9760711193084717,
"rewards/margins": 0.22816362977027893,
"rewards/rejected": -2.2042346000671387,
"step": 490
},
{
"epoch": 0.36023054755043227,
"grad_norm": 18.652142843140656,
"learning_rate": 4.993949053872834e-08,
"logits/chosen": -2.0161242485046387,
"logits/rejected": -2.0025501251220703,
"logps/chosen": -1.012613296508789,
"logps/rejected": -1.140053391456604,
"loss": 1.1806,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.025226593017578,
"rewards/margins": 0.25488021969795227,
"rewards/rejected": -2.280106782913208,
"step": 500
},
{
"epoch": 0.36743515850144093,
"grad_norm": 19.243693238325108,
"learning_rate": 4.9924039429414086e-08,
"logits/chosen": -2.087251663208008,
"logits/rejected": -2.080854654312134,
"logps/chosen": -1.0440865755081177,
"logps/rejected": -1.158582091331482,
"loss": 1.2076,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.0881731510162354,
"rewards/margins": 0.2289910614490509,
"rewards/rejected": -2.317164182662964,
"step": 510
},
{
"epoch": 0.3746397694524496,
"grad_norm": 16.084908380302842,
"learning_rate": 4.990683626304467e-08,
"logits/chosen": -2.010878801345825,
"logits/rejected": -2.009476900100708,
"logps/chosen": -1.1068270206451416,
"logps/rejected": -1.2030669450759888,
"loss": 1.2196,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.213654041290283,
"rewards/margins": 0.19247998297214508,
"rewards/rejected": -2.4061338901519775,
"step": 520
},
{
"epoch": 0.3818443804034582,
"grad_norm": 17.74791604713886,
"learning_rate": 4.9887882248931646e-08,
"logits/chosen": -1.9751720428466797,
"logits/rejected": -1.9651315212249756,
"logps/chosen": -0.9842063188552856,
"logps/rejected": -1.0612623691558838,
"loss": 1.25,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9684126377105713,
"rewards/margins": 0.1541123390197754,
"rewards/rejected": -2.1225247383117676,
"step": 530
},
{
"epoch": 0.38904899135446686,
"grad_norm": 22.74397828503106,
"learning_rate": 4.986717871946393e-08,
"logits/chosen": -2.0001473426818848,
"logits/rejected": -1.9932626485824585,
"logps/chosen": -1.0306423902511597,
"logps/rejected": -1.132361650466919,
"loss": 1.2207,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0612847805023193,
"rewards/margins": 0.20343880355358124,
"rewards/rejected": -2.264723300933838,
"step": 540
},
{
"epoch": 0.3962536023054755,
"grad_norm": 17.28890965818364,
"learning_rate": 4.984472713001416e-08,
"logits/chosen": -1.9692100286483765,
"logits/rejected": -1.9698715209960938,
"logps/chosen": -1.0003505945205688,
"logps/rejected": -1.0772594213485718,
"loss": 1.2685,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.0007011890411377,
"rewards/margins": 0.1538175493478775,
"rewards/rejected": -2.1545188426971436,
"step": 550
},
{
"epoch": 0.4034582132564842,
"grad_norm": 17.119812277596985,
"learning_rate": 4.982052905883637e-08,
"logits/chosen": -2.0280909538269043,
"logits/rejected": -2.0286812782287598,
"logps/chosen": -1.0809977054595947,
"logps/rejected": -1.1807363033294678,
"loss": 1.2255,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.1619954109191895,
"rewards/margins": 0.1994771659374237,
"rewards/rejected": -2.3614726066589355,
"step": 560
},
{
"epoch": 0.4106628242074928,
"grad_norm": 16.296045615559738,
"learning_rate": 4.979458620695505e-08,
"logits/chosen": -2.0341217517852783,
"logits/rejected": -2.0200934410095215,
"logps/chosen": -1.0948175191879272,
"logps/rejected": -1.2078857421875,
"loss": 1.2102,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.1896350383758545,
"rewards/margins": 0.22613653540611267,
"rewards/rejected": -2.415771484375,
"step": 570
},
{
"epoch": 0.41786743515850144,
"grad_norm": 19.547805034076156,
"learning_rate": 4.976690039804555e-08,
"logits/chosen": -2.0328009128570557,
"logits/rejected": -2.03126859664917,
"logps/chosen": -0.9873042106628418,
"logps/rejected": -1.0673751831054688,
"loss": 1.2467,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9746084213256836,
"rewards/margins": 0.16014157235622406,
"rewards/rejected": -2.1347503662109375,
"step": 580
},
{
"epoch": 0.4250720461095101,
"grad_norm": 21.43040480262364,
"learning_rate": 4.973747357830592e-08,
"logits/chosen": -2.020263195037842,
"logits/rejected": -2.0205166339874268,
"logps/chosen": -1.0274614095687866,
"logps/rejected": -1.164903998374939,
"loss": 1.1672,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.0549228191375732,
"rewards/margins": 0.2748851776123047,
"rewards/rejected": -2.329807996749878,
"step": 590
},
{
"epoch": 0.4322766570605187,
"grad_norm": 19.58018590074545,
"learning_rate": 4.970630781632009e-08,
"logits/chosen": -2.076049566268921,
"logits/rejected": -2.072026491165161,
"logps/chosen": -1.0331029891967773,
"logps/rejected": -1.1752078533172607,
"loss": 1.1687,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0662059783935547,
"rewards/margins": 0.2842100262641907,
"rewards/rejected": -2.3504157066345215,
"step": 600
},
{
"epoch": 0.43948126801152737,
"grad_norm": 21.116047152924747,
"learning_rate": 4.967340530291242e-08,
"logits/chosen": -2.02950382232666,
"logits/rejected": -2.01965594291687,
"logps/chosen": -1.09267258644104,
"logps/rejected": -1.150689959526062,
"loss": 1.2681,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.18534517288208,
"rewards/margins": 0.11603420972824097,
"rewards/rejected": -2.301379919052124,
"step": 610
},
{
"epoch": 0.44668587896253603,
"grad_norm": 24.714444992958434,
"learning_rate": 4.9638768350993755e-08,
"logits/chosen": -2.0273375511169434,
"logits/rejected": -2.019911289215088,
"logps/chosen": -0.9958856701850891,
"logps/rejected": -1.082914113998413,
"loss": 1.2353,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -1.9917713403701782,
"rewards/margins": 0.1740569919347763,
"rewards/rejected": -2.165828227996826,
"step": 620
},
{
"epoch": 0.4538904899135447,
"grad_norm": 20.802905839756523,
"learning_rate": 4.9602399395398786e-08,
"logits/chosen": -2.040907859802246,
"logits/rejected": -2.040799379348755,
"logps/chosen": -1.0272337198257446,
"logps/rejected": -1.1544668674468994,
"loss": 1.1828,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.0544674396514893,
"rewards/margins": 0.2544659674167633,
"rewards/rejected": -2.308933734893799,
"step": 630
},
{
"epoch": 0.4610951008645533,
"grad_norm": 16.14908066968672,
"learning_rate": 4.9564300992714914e-08,
"logits/chosen": -1.9591153860092163,
"logits/rejected": -1.9602371454238892,
"logps/chosen": -1.0113928318023682,
"logps/rejected": -1.1170381307601929,
"loss": 1.2104,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0227856636047363,
"rewards/margins": 0.211290642619133,
"rewards/rejected": -2.2340762615203857,
"step": 640
},
{
"epoch": 0.46829971181556196,
"grad_norm": 21.97651048317045,
"learning_rate": 4.952447582110253e-08,
"logits/chosen": -2.0540075302124023,
"logits/rejected": -2.039557933807373,
"logps/chosen": -1.037952184677124,
"logps/rejected": -1.117681860923767,
"loss": 1.2477,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.075904369354248,
"rewards/margins": 0.15945938229560852,
"rewards/rejected": -2.235363721847534,
"step": 650
},
{
"epoch": 0.4755043227665706,
"grad_norm": 23.865036653626944,
"learning_rate": 4.948292668010676e-08,
"logits/chosen": -2.033937454223633,
"logits/rejected": -2.0348212718963623,
"logps/chosen": -1.0879608392715454,
"logps/rejected": -1.1745898723602295,
"loss": 1.2453,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.175921678543091,
"rewards/margins": 0.17325839400291443,
"rewards/rejected": -2.349179744720459,
"step": 660
},
{
"epoch": 0.4827089337175792,
"grad_norm": 20.3891833338485,
"learning_rate": 4.943965649046064e-08,
"logits/chosen": -2.00368332862854,
"logits/rejected": -1.994360327720642,
"logps/chosen": -1.0627825260162354,
"logps/rejected": -1.1664403676986694,
"loss": 1.2155,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.1255650520324707,
"rewards/margins": 0.2073155641555786,
"rewards/rejected": -2.332880735397339,
"step": 670
},
{
"epoch": 0.4899135446685879,
"grad_norm": 19.075566809881007,
"learning_rate": 4.9394668293879835e-08,
"logits/chosen": -1.9593700170516968,
"logits/rejected": -1.950269341468811,
"logps/chosen": -1.0373146533966064,
"logps/rejected": -1.1066360473632812,
"loss": 1.2628,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.074629306793213,
"rewards/margins": 0.13864275813102722,
"rewards/rejected": -2.2132720947265625,
"step": 680
},
{
"epoch": 0.49711815561959655,
"grad_norm": 24.96158275365681,
"learning_rate": 4.93479652528488e-08,
"logits/chosen": -2.0235979557037354,
"logits/rejected": -2.0184168815612793,
"logps/chosen": -1.1050894260406494,
"logps/rejected": -1.2094438076019287,
"loss": 1.226,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.210178852081299,
"rewards/margins": 0.20870868861675262,
"rewards/rejected": -2.4188876152038574,
"step": 690
},
{
"epoch": 0.5043227665706052,
"grad_norm": 20.196803734367005,
"learning_rate": 4.929955065039848e-08,
"logits/chosen": -2.0198333263397217,
"logits/rejected": -2.014254093170166,
"logps/chosen": -1.0191075801849365,
"logps/rejected": -1.1514732837677002,
"loss": 1.183,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.038215160369873,
"rewards/margins": 0.2647314965724945,
"rewards/rejected": -2.3029465675354004,
"step": 700
},
{
"epoch": 0.5115273775216138,
"grad_norm": 19.12481684007211,
"learning_rate": 4.92494278898755e-08,
"logits/chosen": -1.9860668182373047,
"logits/rejected": -1.9829334020614624,
"logps/chosen": -0.8973162770271301,
"logps/rejected": -1.0221717357635498,
"loss": 1.1965,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.7946325540542603,
"rewards/margins": 0.2497110813856125,
"rewards/rejected": -2.0443434715270996,
"step": 710
},
{
"epoch": 0.5187319884726225,
"grad_norm": 18.885298336896575,
"learning_rate": 4.9197600494702955e-08,
"logits/chosen": -2.0109028816223145,
"logits/rejected": -2.0047824382781982,
"logps/chosen": -1.042280912399292,
"logps/rejected": -1.1657941341400146,
"loss": 1.1849,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.084561824798584,
"rewards/margins": 0.2470264732837677,
"rewards/rejected": -2.3315882682800293,
"step": 720
},
{
"epoch": 0.5259365994236311,
"grad_norm": 20.51392204287621,
"learning_rate": 4.9144072108132725e-08,
"logits/chosen": -2.0101022720336914,
"logits/rejected": -1.9990341663360596,
"logps/chosen": -1.0220484733581543,
"logps/rejected": -1.1054035425186157,
"loss": 1.2509,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0440969467163086,
"rewards/margins": 0.16670992970466614,
"rewards/rejected": -2.2108070850372314,
"step": 730
},
{
"epoch": 0.5331412103746398,
"grad_norm": 17.784848123993633,
"learning_rate": 4.908884649298937e-08,
"logits/chosen": -2.0006046295166016,
"logits/rejected": -2.0075409412384033,
"logps/chosen": -1.0191365480422974,
"logps/rejected": -1.079302430152893,
"loss": 1.2835,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -2.0382730960845947,
"rewards/margins": 0.12033157050609589,
"rewards/rejected": -2.158604860305786,
"step": 740
},
{
"epoch": 0.5403458213256485,
"grad_norm": 23.05051802988211,
"learning_rate": 4.903192753140557e-08,
"logits/chosen": -2.0201849937438965,
"logits/rejected": -2.0148415565490723,
"logps/chosen": -1.1010781526565552,
"logps/rejected": -1.1906977891921997,
"loss": 1.2389,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.2021563053131104,
"rewards/margins": 0.1792391687631607,
"rewards/rejected": -2.3813955783843994,
"step": 750
},
{
"epoch": 0.547550432276657,
"grad_norm": 19.69096268460173,
"learning_rate": 4.897331922454931e-08,
"logits/chosen": -1.9745019674301147,
"logits/rejected": -1.9782997369766235,
"logps/chosen": -1.0038034915924072,
"logps/rejected": -1.1137539148330688,
"loss": 1.216,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0076069831848145,
"rewards/margins": 0.21990080177783966,
"rewards/rejected": -2.2275078296661377,
"step": 760
},
{
"epoch": 0.5547550432276657,
"grad_norm": 20.603497657086212,
"learning_rate": 4.891302569234256e-08,
"logits/chosen": -1.9754327535629272,
"logits/rejected": -1.978355050086975,
"logps/chosen": -0.9768314361572266,
"logps/rejected": -1.1296206712722778,
"loss": 1.1631,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9536628723144531,
"rewards/margins": 0.30557847023010254,
"rewards/rejected": -2.2592413425445557,
"step": 770
},
{
"epoch": 0.5619596541786743,
"grad_norm": 22.064873844463353,
"learning_rate": 4.8851051173171656e-08,
"logits/chosen": -1.9894813299179077,
"logits/rejected": -1.988013505935669,
"logps/chosen": -1.0405890941619873,
"logps/rejected": -1.1219489574432373,
"loss": 1.2394,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0811781883239746,
"rewards/margins": 0.1627195179462433,
"rewards/rejected": -2.2438979148864746,
"step": 780
},
{
"epoch": 0.569164265129683,
"grad_norm": 17.44205394859882,
"learning_rate": 4.87874000235894e-08,
"logits/chosen": -2.015829563140869,
"logits/rejected": -2.010009765625,
"logps/chosen": -1.075958490371704,
"logps/rejected": -1.2331421375274658,
"loss": 1.1593,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.151916980743408,
"rewards/margins": 0.3143673837184906,
"rewards/rejected": -2.4662842750549316,
"step": 790
},
{
"epoch": 0.5763688760806917,
"grad_norm": 19.511392524751066,
"learning_rate": 4.872207671800876e-08,
"logits/chosen": -2.0384624004364014,
"logits/rejected": -2.0348961353302,
"logps/chosen": -1.0448932647705078,
"logps/rejected": -1.1220283508300781,
"loss": 1.2573,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0897865295410156,
"rewards/margins": 0.1542699635028839,
"rewards/rejected": -2.2440567016601562,
"step": 800
},
{
"epoch": 0.5835734870317003,
"grad_norm": 16.00392428501834,
"learning_rate": 4.865508584838841e-08,
"logits/chosen": -2.020700693130493,
"logits/rejected": -2.0231757164001465,
"logps/chosen": -1.0133837461471558,
"logps/rejected": -1.103539228439331,
"loss": 1.2329,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.0267674922943115,
"rewards/margins": 0.18031062185764313,
"rewards/rejected": -2.207078456878662,
"step": 810
},
{
"epoch": 0.590778097982709,
"grad_norm": 21.25089832156162,
"learning_rate": 4.858643212390985e-08,
"logits/chosen": -2.02546763420105,
"logits/rejected": -2.015793561935425,
"logps/chosen": -1.029206395149231,
"logps/rejected": -1.115379810333252,
"loss": 1.2494,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.058412790298462,
"rewards/margins": 0.1723467856645584,
"rewards/rejected": -2.230759620666504,
"step": 820
},
{
"epoch": 0.5979827089337176,
"grad_norm": 18.36102437546264,
"learning_rate": 4.851612037064643e-08,
"logits/chosen": -1.9968347549438477,
"logits/rejected": -1.9947017431259155,
"logps/chosen": -0.9606590270996094,
"logps/rejected": -1.0800572633743286,
"loss": 1.2043,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.9213180541992188,
"rewards/margins": 0.2387961596250534,
"rewards/rejected": -2.1601145267486572,
"step": 830
},
{
"epoch": 0.6051873198847262,
"grad_norm": 15.958170019881827,
"learning_rate": 4.8444155531224065e-08,
"logits/chosen": -2.0284435749053955,
"logits/rejected": -2.028543472290039,
"logps/chosen": -1.0880385637283325,
"logps/rejected": -1.1600936651229858,
"loss": 1.2625,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.176077127456665,
"rewards/margins": 0.14411017298698425,
"rewards/rejected": -2.3201873302459717,
"step": 840
},
{
"epoch": 0.6123919308357348,
"grad_norm": 15.447742706439062,
"learning_rate": 4.8370542664473805e-08,
"logits/chosen": -2.034883499145508,
"logits/rejected": -2.029095411300659,
"logps/chosen": -1.0500915050506592,
"logps/rejected": -1.1546186208724976,
"loss": 1.2247,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1001830101013184,
"rewards/margins": 0.20905427634716034,
"rewards/rejected": -2.309237241744995,
"step": 850
},
{
"epoch": 0.6195965417867435,
"grad_norm": 17.975604985927344,
"learning_rate": 4.829528694507624e-08,
"logits/chosen": -2.0074715614318848,
"logits/rejected": -2.003307342529297,
"logps/chosen": -1.1618878841400146,
"logps/rejected": -1.2185773849487305,
"loss": 1.2792,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.3237757682800293,
"rewards/margins": 0.11337918043136597,
"rewards/rejected": -2.437154769897461,
"step": 860
},
{
"epoch": 0.6268011527377522,
"grad_norm": 20.210558301206753,
"learning_rate": 4.821839366319768e-08,
"logits/chosen": -2.0488550662994385,
"logits/rejected": -2.0428953170776367,
"logps/chosen": -1.004872441291809,
"logps/rejected": -1.1229604482650757,
"loss": 1.1965,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.009744882583618,
"rewards/margins": 0.2361760139465332,
"rewards/rejected": -2.2459208965301514,
"step": 870
},
{
"epoch": 0.6340057636887608,
"grad_norm": 19.796801319824315,
"learning_rate": 4.813986822411833e-08,
"logits/chosen": -2.0358963012695312,
"logits/rejected": -2.0338990688323975,
"logps/chosen": -1.0155360698699951,
"logps/rejected": -1.0795470476150513,
"loss": 1.2675,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0310721397399902,
"rewards/margins": 0.12802186608314514,
"rewards/rejected": -2.1590940952301025,
"step": 880
},
{
"epoch": 0.6412103746397695,
"grad_norm": 19.799887695381567,
"learning_rate": 4.805971614785231e-08,
"logits/chosen": -2.0680534839630127,
"logits/rejected": -2.0668766498565674,
"logps/chosen": -1.015794038772583,
"logps/rejected": -1.1115624904632568,
"loss": 1.2198,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.031588077545166,
"rewards/margins": 0.19153663516044617,
"rewards/rejected": -2.2231249809265137,
"step": 890
},
{
"epoch": 0.6484149855907781,
"grad_norm": 20.13934417953023,
"learning_rate": 4.797794306875963e-08,
"logits/chosen": -1.9745270013809204,
"logits/rejected": -1.9761186838150024,
"logps/chosen": -1.1419258117675781,
"logps/rejected": -1.2145435810089111,
"loss": 1.2677,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.2838516235351562,
"rewards/margins": 0.14523524045944214,
"rewards/rejected": -2.4290871620178223,
"step": 900
},
{
"epoch": 0.6556195965417867,
"grad_norm": 20.244043065077097,
"learning_rate": 4.7894554735150076e-08,
"logits/chosen": -1.9853111505508423,
"logits/rejected": -1.9889112710952759,
"logps/chosen": -1.0432493686676025,
"logps/rejected": -1.1088694334030151,
"loss": 1.2634,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.086498737335205,
"rewards/margins": 0.13123992085456848,
"rewards/rejected": -2.2177388668060303,
"step": 910
},
{
"epoch": 0.6628242074927954,
"grad_norm": 23.228641230109883,
"learning_rate": 4.7809557008879185e-08,
"logits/chosen": -2.0164051055908203,
"logits/rejected": -2.0110771656036377,
"logps/chosen": -0.9736042022705078,
"logps/rejected": -1.061522126197815,
"loss": 1.2385,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9472084045410156,
"rewards/margins": 0.17583578824996948,
"rewards/rejected": -2.12304425239563,
"step": 920
},
{
"epoch": 0.670028818443804,
"grad_norm": 18.047096637073768,
"learning_rate": 4.772295586493613e-08,
"logits/chosen": -2.056691884994507,
"logits/rejected": -2.0538461208343506,
"logps/chosen": -1.0346996784210205,
"logps/rejected": -1.1515998840332031,
"loss": 1.1922,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.069399356842041,
"rewards/margins": 0.23380064964294434,
"rewards/rejected": -2.3031997680664062,
"step": 930
},
{
"epoch": 0.6772334293948127,
"grad_norm": 19.617510099326854,
"learning_rate": 4.763475739102374e-08,
"logits/chosen": -2.0092320442199707,
"logits/rejected": -2.015103816986084,
"logps/chosen": -1.1271753311157227,
"logps/rejected": -1.1942684650421143,
"loss": 1.2568,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.2543506622314453,
"rewards/margins": 0.13418647646903992,
"rewards/rejected": -2.3885369300842285,
"step": 940
},
{
"epoch": 0.6844380403458213,
"grad_norm": 15.419611013936972,
"learning_rate": 4.754496778713054e-08,
"logits/chosen": -1.9684407711029053,
"logits/rejected": -1.9725189208984375,
"logps/chosen": -1.011788249015808,
"logps/rejected": -1.1345211267471313,
"loss": 1.2007,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.023576498031616,
"rewards/margins": 0.24546551704406738,
"rewards/rejected": -2.2690422534942627,
"step": 950
},
{
"epoch": 0.69164265129683,
"grad_norm": 21.26727298981788,
"learning_rate": 4.7453593365094926e-08,
"logits/chosen": -2.0424587726593018,
"logits/rejected": -2.0415501594543457,
"logps/chosen": -1.0493271350860596,
"logps/rejected": -1.1593568325042725,
"loss": 1.2079,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.098654270172119,
"rewards/margins": 0.2200593203306198,
"rewards/rejected": -2.318713665008545,
"step": 960
},
{
"epoch": 0.6988472622478387,
"grad_norm": 21.287126597379253,
"learning_rate": 4.736064054816145e-08,
"logits/chosen": -2.0447025299072266,
"logits/rejected": -2.040843963623047,
"logps/chosen": -0.9683746099472046,
"logps/rejected": -1.0945460796356201,
"loss": 1.1791,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.9367492198944092,
"rewards/margins": 0.25234299898147583,
"rewards/rejected": -2.1890921592712402,
"step": 970
},
{
"epoch": 0.7060518731988472,
"grad_norm": 17.20400787363182,
"learning_rate": 4.726611587052933e-08,
"logits/chosen": -1.9701964855194092,
"logits/rejected": -1.9697643518447876,
"logps/chosen": -1.1084095239639282,
"logps/rejected": -1.2358492612838745,
"loss": 1.1795,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.2168190479278564,
"rewards/margins": 0.25487983226776123,
"rewards/rejected": -2.471698522567749,
"step": 980
},
{
"epoch": 0.7132564841498559,
"grad_norm": 22.283752817420396,
"learning_rate": 4.71700259768931e-08,
"logits/chosen": -2.0302042961120605,
"logits/rejected": -2.027015447616577,
"logps/chosen": -1.1091606616973877,
"logps/rejected": -1.2056801319122314,
"loss": 1.234,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.2183213233947754,
"rewards/margins": 0.19303929805755615,
"rewards/rejected": -2.411360263824463,
"step": 990
},
{
"epoch": 0.7204610951008645,
"grad_norm": 19.81391793524057,
"learning_rate": 4.707237762197549e-08,
"logits/chosen": -2.0068042278289795,
"logits/rejected": -2.0036396980285645,
"logps/chosen": -1.0078332424163818,
"logps/rejected": -1.1272451877593994,
"loss": 1.2127,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0156664848327637,
"rewards/margins": 0.23882417380809784,
"rewards/rejected": -2.254490375518799,
"step": 1000
},
{
"epoch": 0.7276657060518732,
"grad_norm": 23.576990411457878,
"learning_rate": 4.697317767005265e-08,
"logits/chosen": -2.0253381729125977,
"logits/rejected": -2.0218160152435303,
"logps/chosen": -1.0018768310546875,
"logps/rejected": -1.0942353010177612,
"loss": 1.2568,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.003753662109375,
"rewards/margins": 0.18471679091453552,
"rewards/rejected": -2.1884706020355225,
"step": 1010
},
{
"epoch": 0.7348703170028819,
"grad_norm": 17.3299238751784,
"learning_rate": 4.6872433094471577e-08,
"logits/chosen": -2.0205771923065186,
"logits/rejected": -2.0157430171966553,
"logps/chosen": -1.0319100618362427,
"logps/rejected": -1.1281745433807373,
"loss": 1.2108,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0638201236724854,
"rewards/margins": 0.19252923130989075,
"rewards/rejected": -2.2563490867614746,
"step": 1020
},
{
"epoch": 0.7420749279538905,
"grad_norm": 16.502866342573686,
"learning_rate": 4.677015097715994e-08,
"logits/chosen": -1.9677197933197021,
"logits/rejected": -1.9671709537506104,
"logps/chosen": -1.0225335359573364,
"logps/rejected": -1.1552186012268066,
"loss": 1.1991,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.045067071914673,
"rewards/margins": 0.26537007093429565,
"rewards/rejected": -2.3104372024536133,
"step": 1030
},
{
"epoch": 0.7492795389048992,
"grad_norm": 17.494781512319758,
"learning_rate": 4.666633850812825e-08,
"logits/chosen": -2.0190138816833496,
"logits/rejected": -2.0128960609436035,
"logps/chosen": -1.0130321979522705,
"logps/rejected": -1.0945827960968018,
"loss": 1.2371,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.026064395904541,
"rewards/margins": 0.163101464509964,
"rewards/rejected": -2.1891655921936035,
"step": 1040
},
{
"epoch": 0.7564841498559077,
"grad_norm": 17.52008871289576,
"learning_rate": 4.656100298496439e-08,
"logits/chosen": -1.9722799062728882,
"logits/rejected": -1.9686037302017212,
"logps/chosen": -0.9384390711784363,
"logps/rejected": -1.0692068338394165,
"loss": 1.1854,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.8768781423568726,
"rewards/margins": 0.2615353763103485,
"rewards/rejected": -2.138413667678833,
"step": 1050
},
{
"epoch": 0.7636887608069164,
"grad_norm": 17.92122527797355,
"learning_rate": 4.6454151812320715e-08,
"logits/chosen": -2.002856969833374,
"logits/rejected": -1.9969203472137451,
"logps/chosen": -1.0393486022949219,
"logps/rejected": -1.1481153964996338,
"loss": 1.2176,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0786972045898438,
"rewards/margins": 0.21753337979316711,
"rewards/rejected": -2.2962307929992676,
"step": 1060
},
{
"epoch": 0.770893371757925,
"grad_norm": 20.81341856841254,
"learning_rate": 4.6345792501393434e-08,
"logits/chosen": -2.0019800662994385,
"logits/rejected": -2.0001296997070312,
"logps/chosen": -1.074857234954834,
"logps/rejected": -1.201908826828003,
"loss": 1.2046,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.149714469909668,
"rewards/margins": 0.25410330295562744,
"rewards/rejected": -2.403817653656006,
"step": 1070
},
{
"epoch": 0.7780979827089337,
"grad_norm": 20.6783251824387,
"learning_rate": 4.6235932669394676e-08,
"logits/chosen": -2.026952028274536,
"logits/rejected": -2.02778697013855,
"logps/chosen": -1.087725043296814,
"logps/rejected": -1.1969935894012451,
"loss": 1.218,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.175450086593628,
"rewards/margins": 0.2185373604297638,
"rewards/rejected": -2.3939871788024902,
"step": 1080
},
{
"epoch": 0.7853025936599424,
"grad_norm": 24.297736541402326,
"learning_rate": 4.612458003901698e-08,
"logits/chosen": -2.035487174987793,
"logits/rejected": -2.0278096199035645,
"logps/chosen": -1.1088950634002686,
"logps/rejected": -1.2112846374511719,
"loss": 1.2279,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.217790126800537,
"rewards/margins": 0.2047790288925171,
"rewards/rejected": -2.4225692749023438,
"step": 1090
},
{
"epoch": 0.792507204610951,
"grad_norm": 23.424012674763112,
"learning_rate": 4.6011742437890476e-08,
"logits/chosen": -2.028799533843994,
"logits/rejected": -2.023322582244873,
"logps/chosen": -1.0456212759017944,
"logps/rejected": -1.179602026939392,
"loss": 1.1772,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.091242551803589,
"rewards/margins": 0.2679617702960968,
"rewards/rejected": -2.359204053878784,
"step": 1100
},
{
"epoch": 0.7997118155619597,
"grad_norm": 16.84111836422693,
"learning_rate": 4.589742779803259e-08,
"logits/chosen": -2.0229039192199707,
"logits/rejected": -2.015812397003174,
"logps/chosen": -1.008448839187622,
"logps/rejected": -1.1298694610595703,
"loss": 1.1935,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.016897678375244,
"rewards/margins": 0.2428409308195114,
"rewards/rejected": -2.2597389221191406,
"step": 1110
},
{
"epoch": 0.8069164265129684,
"grad_norm": 18.417073733768728,
"learning_rate": 4.5781644155290486e-08,
"logits/chosen": -1.9799926280975342,
"logits/rejected": -1.9722731113433838,
"logps/chosen": -1.0475791692733765,
"logps/rejected": -1.1082106828689575,
"loss": 1.2711,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.095158338546753,
"rewards/margins": 0.12126290798187256,
"rewards/rejected": -2.216421365737915,
"step": 1120
},
{
"epoch": 0.8141210374639769,
"grad_norm": 18.07389739764431,
"learning_rate": 4.566439964877613e-08,
"logits/chosen": -2.0103983879089355,
"logits/rejected": -2.0063552856445312,
"logps/chosen": -0.9987322092056274,
"logps/rejected": -1.0854665040969849,
"loss": 1.2431,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.9974644184112549,
"rewards/margins": 0.17346863448619843,
"rewards/rejected": -2.1709330081939697,
"step": 1130
},
{
"epoch": 0.8213256484149856,
"grad_norm": 16.27730585147678,
"learning_rate": 4.554570252029421e-08,
"logits/chosen": -2.0533928871154785,
"logits/rejected": -2.0521607398986816,
"logps/chosen": -1.0483338832855225,
"logps/rejected": -1.164975881576538,
"loss": 1.1998,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.096667766571045,
"rewards/margins": 0.2332839071750641,
"rewards/rejected": -2.329951763153076,
"step": 1140
},
{
"epoch": 0.8285302593659942,
"grad_norm": 17.887214280151717,
"learning_rate": 4.542556111376274e-08,
"logits/chosen": -2.045485496520996,
"logits/rejected": -2.039069175720215,
"logps/chosen": -1.0746331214904785,
"logps/rejected": -1.1668939590454102,
"loss": 1.2386,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.149266242980957,
"rewards/margins": 0.1845216453075409,
"rewards/rejected": -2.3337879180908203,
"step": 1150
},
{
"epoch": 0.8357348703170029,
"grad_norm": 23.056739915075795,
"learning_rate": 4.5303983874626506e-08,
"logits/chosen": -1.9926433563232422,
"logits/rejected": -1.9910094738006592,
"logps/chosen": -1.0387976169586182,
"logps/rejected": -1.1165183782577515,
"loss": 1.2645,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0775952339172363,
"rewards/margins": 0.15544185042381287,
"rewards/rejected": -2.233036756515503,
"step": 1160
},
{
"epoch": 0.8429394812680115,
"grad_norm": 20.255328662941217,
"learning_rate": 4.518097934926339e-08,
"logits/chosen": -1.9955031871795654,
"logits/rejected": -1.9868882894515991,
"logps/chosen": -1.01637601852417,
"logps/rejected": -1.1265536546707153,
"loss": 1.2046,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.03275203704834,
"rewards/margins": 0.22035527229309082,
"rewards/rejected": -2.2531073093414307,
"step": 1170
},
{
"epoch": 0.8501440922190202,
"grad_norm": 22.820031947974105,
"learning_rate": 4.505655618438363e-08,
"logits/chosen": -1.9624055624008179,
"logits/rejected": -1.958373785018921,
"logps/chosen": -1.0602306127548218,
"logps/rejected": -1.1650116443634033,
"loss": 1.2288,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1204612255096436,
"rewards/margins": 0.20956222712993622,
"rewards/rejected": -2.3300232887268066,
"step": 1180
},
{
"epoch": 0.8573487031700289,
"grad_norm": 17.32188484657031,
"learning_rate": 4.4930723126421945e-08,
"logits/chosen": -2.052605152130127,
"logits/rejected": -2.045747995376587,
"logps/chosen": -1.0718796253204346,
"logps/rejected": -1.1474034786224365,
"loss": 1.2515,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.143759250640869,
"rewards/margins": 0.1510476917028427,
"rewards/rejected": -2.294806957244873,
"step": 1190
},
{
"epoch": 0.8645533141210374,
"grad_norm": 22.29345313385636,
"learning_rate": 4.48034890209227e-08,
"logits/chosen": -1.9834630489349365,
"logits/rejected": -1.9713430404663086,
"logps/chosen": -1.0877046585083008,
"logps/rejected": -1.1743061542510986,
"loss": 1.2302,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.1754093170166016,
"rewards/margins": 0.17320279777050018,
"rewards/rejected": -2.3486123085021973,
"step": 1200
},
{
"epoch": 0.8717579250720461,
"grad_norm": 18.453910031808036,
"learning_rate": 4.4674862811918155e-08,
"logits/chosen": -1.9687246084213257,
"logits/rejected": -1.9770466089248657,
"logps/chosen": -0.9387677907943726,
"logps/rejected": -1.091802954673767,
"loss": 1.1595,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.8775355815887451,
"rewards/margins": 0.30607035756111145,
"rewards/rejected": -2.183605909347534,
"step": 1210
},
{
"epoch": 0.8789625360230547,
"grad_norm": 17.380965389789868,
"learning_rate": 4.454485354129966e-08,
"logits/chosen": -1.9993393421173096,
"logits/rejected": -1.9949222803115845,
"logps/chosen": -1.0104951858520508,
"logps/rejected": -1.115613579750061,
"loss": 1.2192,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0209903717041016,
"rewards/margins": 0.21023674309253693,
"rewards/rejected": -2.231227159500122,
"step": 1220
},
{
"epoch": 0.8861671469740634,
"grad_norm": 17.212065292460622,
"learning_rate": 4.4413470348182124e-08,
"logits/chosen": -1.9702112674713135,
"logits/rejected": -1.957925796508789,
"logps/chosen": -0.9851275682449341,
"logps/rejected": -1.076827883720398,
"loss": 1.2315,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -1.9702551364898682,
"rewards/margins": 0.1834007203578949,
"rewards/rejected": -2.153655767440796,
"step": 1230
},
{
"epoch": 0.8933717579250721,
"grad_norm": 21.132247964323447,
"learning_rate": 4.42807224682615e-08,
"logits/chosen": -1.9841238260269165,
"logits/rejected": -1.9821048974990845,
"logps/chosen": -0.9365342855453491,
"logps/rejected": -1.0724506378173828,
"loss": 1.1805,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.8730685710906982,
"rewards/margins": 0.2718326449394226,
"rewards/rejected": -2.1449012756347656,
"step": 1240
},
{
"epoch": 0.9005763688760807,
"grad_norm": 18.771464131402308,
"learning_rate": 4.4146619233165604e-08,
"logits/chosen": -2.0202784538269043,
"logits/rejected": -2.022472858428955,
"logps/chosen": -1.0653743743896484,
"logps/rejected": -1.2193849086761475,
"loss": 1.1677,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.130748748779297,
"rewards/margins": 0.3080212473869324,
"rewards/rejected": -2.438769817352295,
"step": 1250
},
{
"epoch": 0.9077809798270894,
"grad_norm": 24.969245977085052,
"learning_rate": 4.4011170069798126e-08,
"logits/chosen": -2.016045331954956,
"logits/rejected": -2.0211358070373535,
"logps/chosen": -1.1183704137802124,
"logps/rejected": -1.2435810565948486,
"loss": 1.1935,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.236740827560425,
"rewards/margins": 0.2504214644432068,
"rewards/rejected": -2.4871621131896973,
"step": 1260
},
{
"epoch": 0.9149855907780979,
"grad_norm": 17.906289261915187,
"learning_rate": 4.387438449967594e-08,
"logits/chosen": -1.981329321861267,
"logits/rejected": -1.9747323989868164,
"logps/chosen": -0.966105580329895,
"logps/rejected": -1.086094856262207,
"loss": 1.1911,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.93221116065979,
"rewards/margins": 0.2399786412715912,
"rewards/rejected": -2.172189712524414,
"step": 1270
},
{
"epoch": 0.9221902017291066,
"grad_norm": 21.194952039998515,
"learning_rate": 4.373627213825983e-08,
"logits/chosen": -2.0677618980407715,
"logits/rejected": -2.063303232192993,
"logps/chosen": -1.0270278453826904,
"logps/rejected": -1.1622191667556763,
"loss": 1.1834,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.054055690765381,
"rewards/margins": 0.27038270235061646,
"rewards/rejected": -2.3244383335113525,
"step": 1280
},
{
"epoch": 0.9293948126801153,
"grad_norm": 16.751477190296892,
"learning_rate": 4.359684269427848e-08,
"logits/chosen": -2.038684368133545,
"logits/rejected": -2.03769588470459,
"logps/chosen": -0.9954586029052734,
"logps/rejected": -1.0992056131362915,
"loss": 1.2108,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9909172058105469,
"rewards/margins": 0.20749418437480927,
"rewards/rejected": -2.198411226272583,
"step": 1290
},
{
"epoch": 0.9365994236311239,
"grad_norm": 23.518080524189983,
"learning_rate": 4.34561059690461e-08,
"logits/chosen": -2.0750319957733154,
"logits/rejected": -2.0769741535186768,
"logps/chosen": -1.048097014427185,
"logps/rejected": -1.1116466522216797,
"loss": 1.2718,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.09619402885437,
"rewards/margins": 0.12709912657737732,
"rewards/rejected": -2.2232933044433594,
"step": 1300
},
{
"epoch": 0.9438040345821326,
"grad_norm": 21.385571779096182,
"learning_rate": 4.3314071855773314e-08,
"logits/chosen": -2.0412631034851074,
"logits/rejected": -2.0419461727142334,
"logps/chosen": -0.9842621684074402,
"logps/rejected": -1.079594612121582,
"loss": 1.2226,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9685243368148804,
"rewards/margins": 0.19066500663757324,
"rewards/rejected": -2.159189224243164,
"step": 1310
},
{
"epoch": 0.9510086455331412,
"grad_norm": 20.47235892448916,
"learning_rate": 4.3170750338871806e-08,
"logits/chosen": -2.015406847000122,
"logits/rejected": -2.0090079307556152,
"logps/chosen": -1.077161431312561,
"logps/rejected": -1.2194509506225586,
"loss": 1.167,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.154322862625122,
"rewards/margins": 0.28457918763160706,
"rewards/rejected": -2.438901901245117,
"step": 1320
},
{
"epoch": 0.9582132564841499,
"grad_norm": 14.760860419836652,
"learning_rate": 4.3026151493252414e-08,
"logits/chosen": -2.04630446434021,
"logits/rejected": -2.0420799255371094,
"logps/chosen": -1.0609397888183594,
"logps/rejected": -1.182420253753662,
"loss": 1.1998,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1218795776367188,
"rewards/margins": 0.24296097457408905,
"rewards/rejected": -2.364840507507324,
"step": 1330
},
{
"epoch": 0.9654178674351584,
"grad_norm": 25.51297506847031,
"learning_rate": 4.2880285483616895e-08,
"logits/chosen": -2.006889820098877,
"logits/rejected": -2.007575750350952,
"logps/chosen": -1.0171369314193726,
"logps/rejected": -1.1325743198394775,
"loss": 1.2089,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.034273862838745,
"rewards/margins": 0.23087477684020996,
"rewards/rejected": -2.265148639678955,
"step": 1340
},
{
"epoch": 0.9726224783861671,
"grad_norm": 16.088184668896073,
"learning_rate": 4.273316256374342e-08,
"logits/chosen": -1.940446138381958,
"logits/rejected": -1.9386374950408936,
"logps/chosen": -1.0138260126113892,
"logps/rejected": -1.0874342918395996,
"loss": 1.2632,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.0276520252227783,
"rewards/margins": 0.1472165733575821,
"rewards/rejected": -2.174868583679199,
"step": 1350
},
{
"epoch": 0.9798270893371758,
"grad_norm": 16.07758733928266,
"learning_rate": 4.258479307576576e-08,
"logits/chosen": -1.9868743419647217,
"logits/rejected": -1.9846910238265991,
"logps/chosen": -0.9640612602233887,
"logps/rejected": -1.0554001331329346,
"loss": 1.2393,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9281225204467773,
"rewards/margins": 0.18267770111560822,
"rewards/rejected": -2.110800266265869,
"step": 1360
},
{
"epoch": 0.9870317002881844,
"grad_norm": 21.299668948105722,
"learning_rate": 4.243518744944626e-08,
"logits/chosen": -2.015906572341919,
"logits/rejected": -2.0112671852111816,
"logps/chosen": -1.0006954669952393,
"logps/rejected": -1.1211137771606445,
"loss": 1.189,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0013909339904785,
"rewards/margins": 0.24083688855171204,
"rewards/rejected": -2.242227554321289,
"step": 1370
},
{
"epoch": 0.9942363112391931,
"grad_norm": 20.892427881150027,
"learning_rate": 4.22843562014427e-08,
"logits/chosen": -1.9761593341827393,
"logits/rejected": -1.9725821018218994,
"logps/chosen": -1.0507876873016357,
"logps/rejected": -1.1257398128509521,
"loss": 1.2494,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.1015753746032715,
"rewards/margins": 0.14990456402301788,
"rewards/rejected": -2.2514796257019043,
"step": 1380
},
{
"epoch": 1.0014409221902016,
"grad_norm": 27.91445112679855,
"learning_rate": 4.2132309934569e-08,
"logits/chosen": -2.0479187965393066,
"logits/rejected": -2.048383951187134,
"logps/chosen": -1.0160915851593018,
"logps/rejected": -1.1285258531570435,
"loss": 1.211,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0321831703186035,
"rewards/margins": 0.22486881911754608,
"rewards/rejected": -2.257051706314087,
"step": 1390
},
{
"epoch": 1.0086455331412103,
"grad_norm": 18.441960013603726,
"learning_rate": 4.197905933704989e-08,
"logits/chosen": -1.9482128620147705,
"logits/rejected": -1.9455543756484985,
"logps/chosen": -1.0604915618896484,
"logps/rejected": -1.1943556070327759,
"loss": 1.2012,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.120983123779297,
"rewards/margins": 0.2677280306816101,
"rewards/rejected": -2.3887112140655518,
"step": 1400
},
{
"epoch": 1.015850144092219,
"grad_norm": 23.66473994264621,
"learning_rate": 4.1824615181769577e-08,
"logits/chosen": -1.9916549921035767,
"logits/rejected": -1.9958763122558594,
"logps/chosen": -1.0126399993896484,
"logps/rejected": -1.138517141342163,
"loss": 1.2028,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.025279998779297,
"rewards/margins": 0.25175410509109497,
"rewards/rejected": -2.277034282684326,
"step": 1410
},
{
"epoch": 1.0230547550432276,
"grad_norm": 18.457666274318385,
"learning_rate": 4.1668988325514434e-08,
"logits/chosen": -2.015357494354248,
"logits/rejected": -2.0103044509887695,
"logps/chosen": -1.1170918941497803,
"logps/rejected": -1.232860803604126,
"loss": 1.2242,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.2341837882995605,
"rewards/margins": 0.23153769969940186,
"rewards/rejected": -2.465721607208252,
"step": 1420
},
{
"epoch": 1.0302593659942363,
"grad_norm": 21.01670306409276,
"learning_rate": 4.1512189708209844e-08,
"logits/chosen": -2.0597169399261475,
"logits/rejected": -2.058657169342041,
"logps/chosen": -0.9408125877380371,
"logps/rejected": -1.027007818222046,
"loss": 1.2466,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.8816251754760742,
"rewards/margins": 0.17239060997962952,
"rewards/rejected": -2.054015636444092,
"step": 1430
},
{
"epoch": 1.037463976945245,
"grad_norm": 22.302085677116274,
"learning_rate": 4.1354230352151143e-08,
"logits/chosen": -2.0084290504455566,
"logits/rejected": -2.0017716884613037,
"logps/chosen": -1.1378480195999146,
"logps/rejected": -1.2201100587844849,
"loss": 1.2575,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.275696039199829,
"rewards/margins": 0.1645239144563675,
"rewards/rejected": -2.4402201175689697,
"step": 1440
},
{
"epoch": 1.0446685878962536,
"grad_norm": 16.946791101993835,
"learning_rate": 4.119512136122882e-08,
"logits/chosen": -2.07346773147583,
"logits/rejected": -2.0827276706695557,
"logps/chosen": -0.9949871897697449,
"logps/rejected": -1.1448405981063843,
"loss": 1.1711,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.9899743795394897,
"rewards/margins": 0.2997070550918579,
"rewards/rejected": -2.2896811962127686,
"step": 1450
},
{
"epoch": 1.0518731988472623,
"grad_norm": 15.526038466199521,
"learning_rate": 4.103487392014795e-08,
"logits/chosen": -1.9936816692352295,
"logits/rejected": -1.9814279079437256,
"logps/chosen": -1.0004615783691406,
"logps/rejected": -1.1593652963638306,
"loss": 1.1449,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.0009231567382812,
"rewards/margins": 0.31780725717544556,
"rewards/rejected": -2.318730592727661,
"step": 1460
},
{
"epoch": 1.059077809798271,
"grad_norm": 16.90228466492342,
"learning_rate": 4.087349929364192e-08,
"logits/chosen": -2.027029514312744,
"logits/rejected": -2.017503261566162,
"logps/chosen": -0.9608215093612671,
"logps/rejected": -1.091578722000122,
"loss": 1.1869,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.9216430187225342,
"rewards/margins": 0.261514276266098,
"rewards/rejected": -2.183157444000244,
"step": 1470
},
{
"epoch": 1.0662824207492796,
"grad_norm": 17.442053462217785,
"learning_rate": 4.0711008825680645e-08,
"logits/chosen": -1.9791135787963867,
"logits/rejected": -1.978002905845642,
"logps/chosen": -1.006446123123169,
"logps/rejected": -1.1246168613433838,
"loss": 1.2067,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.012892246246338,
"rewards/margins": 0.23634123802185059,
"rewards/rejected": -2.2492337226867676,
"step": 1480
},
{
"epoch": 1.0734870317002883,
"grad_norm": 19.86972344245805,
"learning_rate": 4.054741393867306e-08,
"logits/chosen": -1.994312047958374,
"logits/rejected": -1.9914157390594482,
"logps/chosen": -1.1115689277648926,
"logps/rejected": -1.1622049808502197,
"loss": 1.2879,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.223137855529785,
"rewards/margins": 0.10127194970846176,
"rewards/rejected": -2.3244099617004395,
"step": 1490
},
{
"epoch": 1.080691642651297,
"grad_norm": 18.714554731415358,
"learning_rate": 4.038272613266419e-08,
"logits/chosen": -2.0033118724823,
"logits/rejected": -1.9902782440185547,
"logps/chosen": -1.0098048448562622,
"logps/rejected": -1.1201963424682617,
"loss": 1.2025,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0196096897125244,
"rewards/margins": 0.2207828313112259,
"rewards/rejected": -2.2403926849365234,
"step": 1500
},
{
"epoch": 1.0878962536023056,
"grad_norm": 18.243834119172774,
"learning_rate": 4.0216956984526784e-08,
"logits/chosen": -2.0470855236053467,
"logits/rejected": -2.049050807952881,
"logps/chosen": -1.0156313180923462,
"logps/rejected": -1.1249277591705322,
"loss": 1.2154,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0312626361846924,
"rewards/margins": 0.2185930460691452,
"rewards/rejected": -2.2498555183410645,
"step": 1510
},
{
"epoch": 1.0951008645533142,
"grad_norm": 16.15363780677068,
"learning_rate": 4.0050118147147446e-08,
"logits/chosen": -1.9841066598892212,
"logits/rejected": -1.9844478368759155,
"logps/chosen": -1.0981109142303467,
"logps/rejected": -1.1102923154830933,
"loss": 1.3395,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -2.1962218284606934,
"rewards/margins": 0.024362847208976746,
"rewards/rejected": -2.2205846309661865,
"step": 1520
},
{
"epoch": 1.1023054755043227,
"grad_norm": 17.76262200063469,
"learning_rate": 3.988222134860755e-08,
"logits/chosen": -2.029658317565918,
"logits/rejected": -2.020962953567505,
"logps/chosen": -0.9501702189445496,
"logps/rejected": -1.1164584159851074,
"loss": 1.1391,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.9003404378890991,
"rewards/margins": 0.33257636427879333,
"rewards/rejected": -2.232916831970215,
"step": 1530
},
{
"epoch": 1.1095100864553313,
"grad_norm": 23.81823709896279,
"learning_rate": 3.9713278391358724e-08,
"logits/chosen": -2.0359702110290527,
"logits/rejected": -2.0298221111297607,
"logps/chosen": -1.0248148441314697,
"logps/rejected": -1.1481475830078125,
"loss": 1.1877,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.0496296882629395,
"rewards/margins": 0.24666526913642883,
"rewards/rejected": -2.296295166015625,
"step": 1540
},
{
"epoch": 1.11671469740634,
"grad_norm": 17.830908151102502,
"learning_rate": 3.954330115139328e-08,
"logits/chosen": -2.015063762664795,
"logits/rejected": -2.0099833011627197,
"logps/chosen": -1.0277677774429321,
"logps/rejected": -1.1327736377716064,
"loss": 1.2221,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0555355548858643,
"rewards/margins": 0.21001163125038147,
"rewards/rejected": -2.265547275543213,
"step": 1550
},
{
"epoch": 1.1239193083573487,
"grad_norm": 25.68887769776998,
"learning_rate": 3.937230157740931e-08,
"logits/chosen": -2.070219039916992,
"logits/rejected": -2.064025640487671,
"logps/chosen": -1.0480725765228271,
"logps/rejected": -1.1831330060958862,
"loss": 1.1827,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0961451530456543,
"rewards/margins": 0.27012091875076294,
"rewards/rejected": -2.3662660121917725,
"step": 1560
},
{
"epoch": 1.1311239193083573,
"grad_norm": 16.154765963959324,
"learning_rate": 3.920029168997077e-08,
"logits/chosen": -2.0501182079315186,
"logits/rejected": -2.048215389251709,
"logps/chosen": -1.0040074586868286,
"logps/rejected": -1.1317455768585205,
"loss": 1.1863,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0080149173736572,
"rewards/margins": 0.25547635555267334,
"rewards/rejected": -2.263491153717041,
"step": 1570
},
{
"epoch": 1.138328530259366,
"grad_norm": 29.765979988811136,
"learning_rate": 3.9027283580662476e-08,
"logits/chosen": -2.0178141593933105,
"logits/rejected": -2.0118331909179688,
"logps/chosen": -1.047828197479248,
"logps/rejected": -1.193880319595337,
"loss": 1.1762,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.095656394958496,
"rewards/margins": 0.2921043336391449,
"rewards/rejected": -2.387760639190674,
"step": 1580
},
{
"epoch": 1.1455331412103746,
"grad_norm": 16.865277551940466,
"learning_rate": 3.885328941124014e-08,
"logits/chosen": -1.9888120889663696,
"logits/rejected": -1.9842865467071533,
"logps/chosen": -0.9665737152099609,
"logps/rejected": -1.1005498170852661,
"loss": 1.1706,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.9331474304199219,
"rewards/margins": 0.2679522633552551,
"rewards/rejected": -2.2010996341705322,
"step": 1590
},
{
"epoch": 1.1527377521613833,
"grad_norm": 20.89972191635711,
"learning_rate": 3.867832141277539e-08,
"logits/chosen": -2.0299296379089355,
"logits/rejected": -2.020932912826538,
"logps/chosen": -1.0687669515609741,
"logps/rejected": -1.1805663108825684,
"loss": 1.2101,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.1375339031219482,
"rewards/margins": 0.2235983908176422,
"rewards/rejected": -2.3611326217651367,
"step": 1600
},
{
"epoch": 1.159942363112392,
"grad_norm": 20.78071767638211,
"learning_rate": 3.850239188479606e-08,
"logits/chosen": -1.9834659099578857,
"logits/rejected": -1.9868577718734741,
"logps/chosen": -1.0097862482070923,
"logps/rejected": -1.1004573106765747,
"loss": 1.237,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.0195724964141846,
"rewards/margins": 0.18134194612503052,
"rewards/rejected": -2.2009146213531494,
"step": 1610
},
{
"epoch": 1.1671469740634006,
"grad_norm": 22.02097078416428,
"learning_rate": 3.832551319442151e-08,
"logits/chosen": -2.057338237762451,
"logits/rejected": -2.0585570335388184,
"logps/chosen": -1.057908296585083,
"logps/rejected": -1.1848082542419434,
"loss": 1.1897,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.115816593170166,
"rewards/margins": 0.25380033254623413,
"rewards/rejected": -2.3696165084838867,
"step": 1620
},
{
"epoch": 1.1743515850144093,
"grad_norm": 17.325160620804777,
"learning_rate": 3.81476977754933e-08,
"logits/chosen": -1.9559204578399658,
"logits/rejected": -1.952262282371521,
"logps/chosen": -1.0270769596099854,
"logps/rejected": -1.0972059965133667,
"loss": 1.2578,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0541539192199707,
"rewards/margins": 0.14025799930095673,
"rewards/rejected": -2.1944119930267334,
"step": 1630
},
{
"epoch": 1.181556195965418,
"grad_norm": 16.860559579230735,
"learning_rate": 3.796895812770114e-08,
"logits/chosen": -1.9805179834365845,
"logits/rejected": -1.981414794921875,
"logps/chosen": -1.0173685550689697,
"logps/rejected": -1.1094672679901123,
"loss": 1.2405,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0347371101379395,
"rewards/margins": 0.18419703841209412,
"rewards/rejected": -2.2189345359802246,
"step": 1640
},
{
"epoch": 1.1887608069164266,
"grad_norm": 22.22569351927079,
"learning_rate": 3.7789306815704216e-08,
"logits/chosen": -2.010031223297119,
"logits/rejected": -2.0077967643737793,
"logps/chosen": -1.0069730281829834,
"logps/rejected": -1.0787549018859863,
"loss": 1.261,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.013946056365967,
"rewards/margins": 0.14356335997581482,
"rewards/rejected": -2.1575098037719727,
"step": 1650
},
{
"epoch": 1.195965417867435,
"grad_norm": 18.941008996813895,
"learning_rate": 3.760875646824795e-08,
"logits/chosen": -1.9386460781097412,
"logits/rejected": -1.942348837852478,
"logps/chosen": -0.9752788543701172,
"logps/rejected": -1.0792890787124634,
"loss": 1.2239,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9505577087402344,
"rewards/margins": 0.20802041888237,
"rewards/rejected": -2.1585781574249268,
"step": 1660
},
{
"epoch": 1.2031700288184437,
"grad_norm": 22.29470132845054,
"learning_rate": 3.742731977727623e-08,
"logits/chosen": -2.031289577484131,
"logits/rejected": -2.028223991394043,
"logps/chosen": -1.0405927896499634,
"logps/rejected": -1.1778171062469482,
"loss": 1.1781,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0811855792999268,
"rewards/margins": 0.27444881200790405,
"rewards/rejected": -2.3556342124938965,
"step": 1670
},
{
"epoch": 1.2103746397694524,
"grad_norm": 19.668800943191464,
"learning_rate": 3.7245009497039244e-08,
"logits/chosen": -1.9710372686386108,
"logits/rejected": -1.9631189107894897,
"logps/chosen": -1.0121662616729736,
"logps/rejected": -1.1485233306884766,
"loss": 1.1722,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.0243325233459473,
"rewards/margins": 0.2727140784263611,
"rewards/rejected": -2.297046661376953,
"step": 1680
},
{
"epoch": 1.217579250720461,
"grad_norm": 18.855322537148837,
"learning_rate": 3.7061838443196886e-08,
"logits/chosen": -2.0141379833221436,
"logits/rejected": -2.0157604217529297,
"logps/chosen": -1.0264530181884766,
"logps/rejected": -1.149954080581665,
"loss": 1.1888,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.052906036376953,
"rewards/margins": 0.24700184166431427,
"rewards/rejected": -2.29990816116333,
"step": 1690
},
{
"epoch": 1.2247838616714697,
"grad_norm": 22.698689716068593,
"learning_rate": 3.68778194919179e-08,
"logits/chosen": -1.984043836593628,
"logits/rejected": -1.9850364923477173,
"logps/chosen": -1.0795161724090576,
"logps/rejected": -1.2015224695205688,
"loss": 1.195,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.1590323448181152,
"rewards/margins": 0.24401259422302246,
"rewards/rejected": -2.4030449390411377,
"step": 1700
},
{
"epoch": 1.2319884726224783,
"grad_norm": 20.214389140478467,
"learning_rate": 3.66929655789747e-08,
"logits/chosen": -2.0348191261291504,
"logits/rejected": -2.023660659790039,
"logps/chosen": -0.9398587346076965,
"logps/rejected": -1.0924385786056519,
"loss": 1.1626,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.879717469215393,
"rewards/margins": 0.3051597476005554,
"rewards/rejected": -2.1848771572113037,
"step": 1710
},
{
"epoch": 1.239193083573487,
"grad_norm": 16.465610751100254,
"learning_rate": 3.6507289698834064e-08,
"logits/chosen": -1.9764940738677979,
"logits/rejected": -1.9729808568954468,
"logps/chosen": -0.9838182330131531,
"logps/rejected": -1.1163631677627563,
"loss": 1.1955,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.9676364660263062,
"rewards/margins": 0.2650895118713379,
"rewards/rejected": -2.2327263355255127,
"step": 1720
},
{
"epoch": 1.2463976945244957,
"grad_norm": 25.365894303851952,
"learning_rate": 3.6320804903743684e-08,
"logits/chosen": -2.0223116874694824,
"logits/rejected": -2.0218966007232666,
"logps/chosen": -1.0339914560317993,
"logps/rejected": -1.159183144569397,
"loss": 1.1983,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0679829120635986,
"rewards/margins": 0.250383585691452,
"rewards/rejected": -2.318366289138794,
"step": 1730
},
{
"epoch": 1.2536023054755043,
"grad_norm": 17.275189136976564,
"learning_rate": 3.61335243028146e-08,
"logits/chosen": -2.011654853820801,
"logits/rejected": -2.01637601852417,
"logps/chosen": -1.0918588638305664,
"logps/rejected": -1.2234910726547241,
"loss": 1.1905,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.183717727661133,
"rewards/margins": 0.26326465606689453,
"rewards/rejected": -2.4469821453094482,
"step": 1740
},
{
"epoch": 1.260806916426513,
"grad_norm": 18.5007118428664,
"learning_rate": 3.5945461061099736e-08,
"logits/chosen": -1.9712812900543213,
"logits/rejected": -1.9578218460083008,
"logps/chosen": -1.0444309711456299,
"logps/rejected": -1.1218526363372803,
"loss": 1.2707,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.0888619422912598,
"rewards/margins": 0.1548432558774948,
"rewards/rejected": -2.2437052726745605,
"step": 1750
},
{
"epoch": 1.2680115273775217,
"grad_norm": 19.826773581846037,
"learning_rate": 3.5756628398668446e-08,
"logits/chosen": -2.0560269355773926,
"logits/rejected": -2.061149835586548,
"logps/chosen": -1.1327307224273682,
"logps/rejected": -1.2321850061416626,
"loss": 1.2409,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.2654614448547363,
"rewards/margins": 0.1989085078239441,
"rewards/rejected": -2.464370012283325,
"step": 1760
},
{
"epoch": 1.2752161383285303,
"grad_norm": 17.659111449492986,
"learning_rate": 3.556703958967716e-08,
"logits/chosen": -2.042252779006958,
"logits/rejected": -2.0375871658325195,
"logps/chosen": -1.051990270614624,
"logps/rejected": -1.1855313777923584,
"loss": 1.1891,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.103980541229248,
"rewards/margins": 0.2670823037624359,
"rewards/rejected": -2.371062755584717,
"step": 1770
},
{
"epoch": 1.282420749279539,
"grad_norm": 24.065221888255977,
"learning_rate": 3.5376707961436297e-08,
"logits/chosen": -2.0278120040893555,
"logits/rejected": -2.022207498550415,
"logps/chosen": -1.1405036449432373,
"logps/rejected": -1.2026771306991577,
"loss": 1.2719,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.2810072898864746,
"rewards/margins": 0.12434691190719604,
"rewards/rejected": -2.4053542613983154,
"step": 1780
},
{
"epoch": 1.2896253602305476,
"grad_norm": 12.853822919695737,
"learning_rate": 3.51856468934734e-08,
"logits/chosen": -1.9812190532684326,
"logits/rejected": -1.9826923608779907,
"logps/chosen": -0.9758992195129395,
"logps/rejected": -1.0700486898422241,
"loss": 1.2226,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.951798439025879,
"rewards/margins": 0.18829897046089172,
"rewards/rejected": -2.1400973796844482,
"step": 1790
},
{
"epoch": 1.2968299711815563,
"grad_norm": 20.14401512444606,
"learning_rate": 3.499386981659262e-08,
"logits/chosen": -2.0630898475646973,
"logits/rejected": -2.0576171875,
"logps/chosen": -1.018842101097107,
"logps/rejected": -1.209272027015686,
"loss": 1.1238,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.037684202194214,
"rewards/margins": 0.3808597922325134,
"rewards/rejected": -2.418544054031372,
"step": 1800
},
{
"epoch": 1.304034582132565,
"grad_norm": 20.90969448735332,
"learning_rate": 3.480139021193057e-08,
"logits/chosen": -1.9834129810333252,
"logits/rejected": -1.985131859779358,
"logps/chosen": -0.9966486692428589,
"logps/rejected": -1.1168452501296997,
"loss": 1.2127,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.9932973384857178,
"rewards/margins": 0.24039287865161896,
"rewards/rejected": -2.2336905002593994,
"step": 1810
},
{
"epoch": 1.3112391930835736,
"grad_norm": 28.592045906928604,
"learning_rate": 3.4608221610008666e-08,
"logits/chosen": -2.018594264984131,
"logits/rejected": -2.0142102241516113,
"logps/chosen": -0.9736968278884888,
"logps/rejected": -1.1203354597091675,
"loss": 1.1713,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9473936557769775,
"rewards/margins": 0.2932773232460022,
"rewards/rejected": -2.240670919418335,
"step": 1820
},
{
"epoch": 1.318443804034582,
"grad_norm": 15.191719500704991,
"learning_rate": 3.4414377589782e-08,
"logits/chosen": -1.9855458736419678,
"logits/rejected": -1.9946119785308838,
"logps/chosen": -1.0181246995925903,
"logps/rejected": -1.1509206295013428,
"loss": 1.1963,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.0362493991851807,
"rewards/margins": 0.2655918300151825,
"rewards/rejected": -2.3018412590026855,
"step": 1830
},
{
"epoch": 1.3256484149855907,
"grad_norm": 18.19541860204369,
"learning_rate": 3.4219871777684745e-08,
"logits/chosen": -1.9971675872802734,
"logits/rejected": -1.984905481338501,
"logps/chosen": -0.9933854937553406,
"logps/rejected": -1.1145892143249512,
"loss": 1.2078,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9867709875106812,
"rewards/margins": 0.24240756034851074,
"rewards/rejected": -2.2291784286499023,
"step": 1840
},
{
"epoch": 1.3328530259365994,
"grad_norm": 17.72178124025082,
"learning_rate": 3.4024717846672364e-08,
"logits/chosen": -2.0331177711486816,
"logits/rejected": -2.026477336883545,
"logps/chosen": -0.9942334890365601,
"logps/rejected": -1.1216745376586914,
"loss": 1.1967,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9884669780731201,
"rewards/margins": 0.25488215684890747,
"rewards/rejected": -2.243349075317383,
"step": 1850
},
{
"epoch": 1.340057636887608,
"grad_norm": 17.801651438890392,
"learning_rate": 3.382892951526036e-08,
"logits/chosen": -2.018220901489258,
"logits/rejected": -2.0154216289520264,
"logps/chosen": -1.0521572828292847,
"logps/rejected": -1.2008370161056519,
"loss": 1.163,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.1043145656585693,
"rewards/margins": 0.2973593771457672,
"rewards/rejected": -2.4016740322113037,
"step": 1860
},
{
"epoch": 1.3472622478386167,
"grad_norm": 20.318968041064025,
"learning_rate": 3.3632520546559974e-08,
"logits/chosen": -1.9867897033691406,
"logits/rejected": -1.9751968383789062,
"logps/chosen": -0.926361083984375,
"logps/rejected": -1.0954601764678955,
"loss": 1.1271,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.85272216796875,
"rewards/margins": 0.3381980061531067,
"rewards/rejected": -2.190920352935791,
"step": 1870
},
{
"epoch": 1.3544668587896254,
"grad_norm": 19.630592165862417,
"learning_rate": 3.34355047473107e-08,
"logits/chosen": -2.0014548301696777,
"logits/rejected": -1.997385025024414,
"logps/chosen": -1.0289537906646729,
"logps/rejected": -1.118239164352417,
"loss": 1.245,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0579075813293457,
"rewards/margins": 0.1785707026720047,
"rewards/rejected": -2.236478328704834,
"step": 1880
},
{
"epoch": 1.361671469740634,
"grad_norm": 22.9649823694943,
"learning_rate": 3.323789596690971e-08,
"logits/chosen": -1.9707273244857788,
"logits/rejected": -1.9716689586639404,
"logps/chosen": -1.0208336114883423,
"logps/rejected": -1.1546828746795654,
"loss": 1.1792,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0416672229766846,
"rewards/margins": 0.26769858598709106,
"rewards/rejected": -2.309365749359131,
"step": 1890
},
{
"epoch": 1.3688760806916427,
"grad_norm": 15.719014755563348,
"learning_rate": 3.303970809643828e-08,
"logits/chosen": -2.000805139541626,
"logits/rejected": -2.0052528381347656,
"logps/chosen": -1.0358805656433105,
"logps/rejected": -1.164954423904419,
"loss": 1.1925,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.071761131286621,
"rewards/margins": 0.2581479847431183,
"rewards/rejected": -2.329908847808838,
"step": 1900
},
{
"epoch": 1.3760806916426513,
"grad_norm": 20.784381707823112,
"learning_rate": 3.2840955067685356e-08,
"logits/chosen": -2.0275561809539795,
"logits/rejected": -2.031751871109009,
"logps/chosen": -1.0541309118270874,
"logps/rejected": -1.2037460803985596,
"loss": 1.1612,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.108261823654175,
"rewards/margins": 0.29923057556152344,
"rewards/rejected": -2.407492160797119,
"step": 1910
},
{
"epoch": 1.38328530259366,
"grad_norm": 16.91937122091795,
"learning_rate": 3.264165085216817e-08,
"logits/chosen": -2.0380663871765137,
"logits/rejected": -2.0380921363830566,
"logps/chosen": -0.9351627230644226,
"logps/rejected": -1.1040947437286377,
"loss": 1.1393,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.8703254461288452,
"rewards/margins": 0.3378642499446869,
"rewards/rejected": -2.2081894874572754,
"step": 1920
},
{
"epoch": 1.3904899135446687,
"grad_norm": 18.773223856961657,
"learning_rate": 3.244180946015008e-08,
"logits/chosen": -1.9662561416625977,
"logits/rejected": -1.96682608127594,
"logps/chosen": -1.0346488952636719,
"logps/rejected": -1.0978872776031494,
"loss": 1.2739,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.0692977905273438,
"rewards/margins": 0.12647677958011627,
"rewards/rejected": -2.195774555206299,
"step": 1930
},
{
"epoch": 1.397694524495677,
"grad_norm": 15.42949606476335,
"learning_rate": 3.224144493965578e-08,
"logits/chosen": -2.0522544384002686,
"logits/rejected": -2.0557808876037598,
"logps/chosen": -0.9907134175300598,
"logps/rejected": -1.095879316329956,
"loss": 1.217,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.9814268350601196,
"rewards/margins": 0.2103317677974701,
"rewards/rejected": -2.191758632659912,
"step": 1940
},
{
"epoch": 1.4048991354466858,
"grad_norm": 17.867258553909902,
"learning_rate": 3.204057137548371e-08,
"logits/chosen": -2.0167171955108643,
"logits/rejected": -2.011385202407837,
"logps/chosen": -0.9775940179824829,
"logps/rejected": -1.0835435390472412,
"loss": 1.2152,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9551880359649658,
"rewards/margins": 0.21189892292022705,
"rewards/rejected": -2.1670870780944824,
"step": 1950
},
{
"epoch": 1.4121037463976944,
"grad_norm": 19.490021831335156,
"learning_rate": 3.183920288821597e-08,
"logits/chosen": -1.9968830347061157,
"logits/rejected": -1.993549108505249,
"logps/chosen": -1.0021111965179443,
"logps/rejected": -1.163733959197998,
"loss": 1.1461,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0042223930358887,
"rewards/margins": 0.3232455551624298,
"rewards/rejected": -2.327467918395996,
"step": 1960
},
{
"epoch": 1.419308357348703,
"grad_norm": 23.55301780513215,
"learning_rate": 3.1637353633225735e-08,
"logits/chosen": -2.042677879333496,
"logits/rejected": -2.0366151332855225,
"logps/chosen": -1.0290377140045166,
"logps/rejected": -1.1746978759765625,
"loss": 1.1717,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.058075428009033,
"rewards/margins": 0.2913200259208679,
"rewards/rejected": -2.349395751953125,
"step": 1970
},
{
"epoch": 1.4265129682997117,
"grad_norm": 19.629394462964214,
"learning_rate": 3.143503779968213e-08,
"logits/chosen": -2.011504650115967,
"logits/rejected": -2.011737823486328,
"logps/chosen": -1.015564203262329,
"logps/rejected": -1.1511462926864624,
"loss": 1.196,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.031128406524658,
"rewards/margins": 0.27116426825523376,
"rewards/rejected": -2.302292585372925,
"step": 1980
},
{
"epoch": 1.4337175792507204,
"grad_norm": 18.105081870748133,
"learning_rate": 3.1232269609552875e-08,
"logits/chosen": -1.9979522228240967,
"logits/rejected": -1.995548963546753,
"logps/chosen": -0.9974485635757446,
"logps/rejected": -1.1187690496444702,
"loss": 1.2004,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9948971271514893,
"rewards/margins": 0.24264100193977356,
"rewards/rejected": -2.2375380992889404,
"step": 1990
},
{
"epoch": 1.440922190201729,
"grad_norm": 16.05671541036881,
"learning_rate": 3.102906331660444e-08,
"logits/chosen": -2.0580544471740723,
"logits/rejected": -2.0497653484344482,
"logps/chosen": -0.9930634498596191,
"logps/rejected": -1.1649951934814453,
"loss": 1.1345,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.9861268997192383,
"rewards/margins": 0.3438633382320404,
"rewards/rejected": -2.3299903869628906,
"step": 2000
},
{
"epoch": 1.4481268011527377,
"grad_norm": 16.003021454074023,
"learning_rate": 3.082543320540015e-08,
"logits/chosen": -1.9997708797454834,
"logits/rejected": -1.992846131324768,
"logps/chosen": -1.0060840845108032,
"logps/rejected": -1.1501317024230957,
"loss": 1.1672,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0121681690216064,
"rewards/margins": 0.28809523582458496,
"rewards/rejected": -2.3002634048461914,
"step": 2010
},
{
"epoch": 1.4553314121037464,
"grad_norm": 17.99920026821121,
"learning_rate": 3.062139359029599e-08,
"logits/chosen": -2.029757022857666,
"logits/rejected": -2.029585361480713,
"logps/chosen": -1.0290124416351318,
"logps/rejected": -1.1135772466659546,
"loss": 1.2472,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.0580248832702637,
"rewards/margins": 0.16912977397441864,
"rewards/rejected": -2.227154493331909,
"step": 2020
},
{
"epoch": 1.462536023054755,
"grad_norm": 18.76605541973094,
"learning_rate": 3.041695881443437e-08,
"logits/chosen": -2.051182270050049,
"logits/rejected": -2.04660964012146,
"logps/chosen": -0.9734565019607544,
"logps/rejected": -1.1085374355316162,
"loss": 1.1781,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.9469130039215088,
"rewards/margins": 0.27016210556030273,
"rewards/rejected": -2.2170748710632324,
"step": 2030
},
{
"epoch": 1.4697406340057637,
"grad_norm": 22.16241778931301,
"learning_rate": 3.0212143248735886e-08,
"logits/chosen": -2.0314226150512695,
"logits/rejected": -2.0317978858947754,
"logps/chosen": -0.9990888833999634,
"logps/rejected": -1.1364792585372925,
"loss": 1.1757,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9981777667999268,
"rewards/margins": 0.2747807502746582,
"rewards/rejected": -2.272958517074585,
"step": 2040
},
{
"epoch": 1.4769452449567724,
"grad_norm": 19.71057724214497,
"learning_rate": 3.0006961290889077e-08,
"logits/chosen": -2.0183329582214355,
"logits/rejected": -2.009127378463745,
"logps/chosen": -1.1189202070236206,
"logps/rejected": -1.286902904510498,
"loss": 1.1652,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.237840414047241,
"rewards/margins": 0.33596524596214294,
"rewards/rejected": -2.573805809020996,
"step": 2050
},
{
"epoch": 1.484149855907781,
"grad_norm": 21.95290350074783,
"learning_rate": 2.980142736433833e-08,
"logits/chosen": -2.008192777633667,
"logits/rejected": -2.001173496246338,
"logps/chosen": -1.0314289331436157,
"logps/rejected": -1.0944207906723022,
"loss": 1.2767,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.0628578662872314,
"rewards/margins": 0.12598386406898499,
"rewards/rejected": -2.1888415813446045,
"step": 2060
},
{
"epoch": 1.4913544668587897,
"grad_norm": 24.378073368269256,
"learning_rate": 2.9595555917269997e-08,
"logits/chosen": -2.039536952972412,
"logits/rejected": -2.0248141288757324,
"logps/chosen": -1.140825867652893,
"logps/rejected": -1.2375624179840088,
"loss": 1.2147,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.281651735305786,
"rewards/margins": 0.19347305595874786,
"rewards/rejected": -2.4751248359680176,
"step": 2070
},
{
"epoch": 1.4985590778097984,
"grad_norm": 18.478093665467945,
"learning_rate": 2.9389361421596725e-08,
"logits/chosen": -1.9539821147918701,
"logits/rejected": -1.9563089609146118,
"logps/chosen": -1.0598758459091187,
"logps/rejected": -1.1930882930755615,
"loss": 1.1842,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.1197516918182373,
"rewards/margins": 0.26642483472824097,
"rewards/rejected": -2.386176586151123,
"step": 2080
},
{
"epoch": 1.505763688760807,
"grad_norm": 20.36561616479061,
"learning_rate": 2.9182858371940126e-08,
"logits/chosen": -2.0380711555480957,
"logits/rejected": -2.032642364501953,
"logps/chosen": -1.046942949295044,
"logps/rejected": -1.1760826110839844,
"loss": 1.1872,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.093885898590088,
"rewards/margins": 0.2582792639732361,
"rewards/rejected": -2.3521652221679688,
"step": 2090
},
{
"epoch": 1.5129682997118157,
"grad_norm": 18.97930654154894,
"learning_rate": 2.8976061284611908e-08,
"logits/chosen": -1.9913969039916992,
"logits/rejected": -2.0002284049987793,
"logps/chosen": -0.9360917806625366,
"logps/rejected": -1.0654878616333008,
"loss": 1.194,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.8721835613250732,
"rewards/margins": 0.2587924301624298,
"rewards/rejected": -2.1309757232666016,
"step": 2100
},
{
"epoch": 1.5201729106628243,
"grad_norm": 21.429455246868766,
"learning_rate": 2.8768984696593384e-08,
"logits/chosen": -1.978727102279663,
"logits/rejected": -1.9692051410675049,
"logps/chosen": -1.0171012878417969,
"logps/rejected": -1.1342874765396118,
"loss": 1.2159,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0342025756835938,
"rewards/margins": 0.23437246680259705,
"rewards/rejected": -2.2685749530792236,
"step": 2110
},
{
"epoch": 1.527377521613833,
"grad_norm": 17.997474835260586,
"learning_rate": 2.8561643164513637e-08,
"logits/chosen": -1.9045627117156982,
"logits/rejected": -1.900713324546814,
"logps/chosen": -1.0493916273117065,
"logps/rejected": -1.1679325103759766,
"loss": 1.2012,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.098783254623413,
"rewards/margins": 0.2370818853378296,
"rewards/rejected": -2.335865020751953,
"step": 2120
},
{
"epoch": 1.5345821325648417,
"grad_norm": 18.947013598861446,
"learning_rate": 2.8354051263626227e-08,
"logits/chosen": -1.9892809391021729,
"logits/rejected": -1.9950027465820312,
"logps/chosen": -1.0602303743362427,
"logps/rejected": -1.1739084720611572,
"loss": 1.2059,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1204607486724854,
"rewards/margins": 0.22735624015331268,
"rewards/rejected": -2.3478169441223145,
"step": 2130
},
{
"epoch": 1.54178674351585,
"grad_norm": 19.64777274554225,
"learning_rate": 2.8146223586784573e-08,
"logits/chosen": -1.980348825454712,
"logits/rejected": -1.9723879098892212,
"logps/chosen": -1.065375566482544,
"logps/rejected": -1.1992082595825195,
"loss": 1.1877,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.130751132965088,
"rewards/margins": 0.2676653265953064,
"rewards/rejected": -2.398416519165039,
"step": 2140
},
{
"epoch": 1.5489913544668588,
"grad_norm": 25.24267359813586,
"learning_rate": 2.7938174743416205e-08,
"logits/chosen": -1.9437439441680908,
"logits/rejected": -1.9405949115753174,
"logps/chosen": -1.0510722398757935,
"logps/rejected": -1.1613985300064087,
"loss": 1.2136,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.102144479751587,
"rewards/margins": 0.22065265476703644,
"rewards/rejected": -2.3227970600128174,
"step": 2150
},
{
"epoch": 1.5561959654178674,
"grad_norm": 19.677128525439006,
"learning_rate": 2.7729919358495728e-08,
"logits/chosen": -2.002791404724121,
"logits/rejected": -2.0038230419158936,
"logps/chosen": -1.112272024154663,
"logps/rejected": -1.1909116506576538,
"loss": 1.2588,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.224544048309326,
"rewards/margins": 0.15727964043617249,
"rewards/rejected": -2.3818233013153076,
"step": 2160
},
{
"epoch": 1.563400576368876,
"grad_norm": 19.250624519916794,
"learning_rate": 2.7521472071516772e-08,
"logits/chosen": -1.997267484664917,
"logits/rejected": -1.9961631298065186,
"logps/chosen": -0.9451554417610168,
"logps/rejected": -1.0597981214523315,
"loss": 1.2086,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.8903108835220337,
"rewards/margins": 0.22928544878959656,
"rewards/rejected": -2.119596242904663,
"step": 2170
},
{
"epoch": 1.5706051873198847,
"grad_norm": 21.883332512382378,
"learning_rate": 2.731284753546289e-08,
"logits/chosen": -1.987908959388733,
"logits/rejected": -1.9858767986297607,
"logps/chosen": -1.081416130065918,
"logps/rejected": -1.222598671913147,
"loss": 1.1737,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.162832260131836,
"rewards/margins": 0.28236496448516846,
"rewards/rejected": -2.445197343826294,
"step": 2180
},
{
"epoch": 1.5778097982708934,
"grad_norm": 21.934933640290577,
"learning_rate": 2.710406041577751e-08,
"logits/chosen": -2.0529561042785645,
"logits/rejected": -2.049743175506592,
"logps/chosen": -1.0325100421905518,
"logps/rejected": -1.1861141920089722,
"loss": 1.1625,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0650200843811035,
"rewards/margins": 0.3072081506252289,
"rewards/rejected": -2.3722283840179443,
"step": 2190
},
{
"epoch": 1.585014409221902,
"grad_norm": 18.04514565686392,
"learning_rate": 2.6895125389333017e-08,
"logits/chosen": -2.01184344291687,
"logits/rejected": -2.0075669288635254,
"logps/chosen": -1.026865839958191,
"logps/rejected": -1.1786205768585205,
"loss": 1.1613,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.053731679916382,
"rewards/margins": 0.30350956320762634,
"rewards/rejected": -2.357241153717041,
"step": 2200
},
{
"epoch": 1.5922190201729105,
"grad_norm": 17.097050510727,
"learning_rate": 2.6686057143399028e-08,
"logits/chosen": -2.0109105110168457,
"logits/rejected": -2.0125486850738525,
"logps/chosen": -1.0616767406463623,
"logps/rejected": -1.1599900722503662,
"loss": 1.2429,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.1233534812927246,
"rewards/margins": 0.1966264694929123,
"rewards/rejected": -2.3199801445007324,
"step": 2210
},
{
"epoch": 1.5994236311239192,
"grad_norm": 19.369226164400942,
"learning_rate": 2.647687037460996e-08,
"logits/chosen": -2.0144858360290527,
"logits/rejected": -2.0138607025146484,
"logps/chosen": -1.0874732732772827,
"logps/rejected": -1.2833433151245117,
"loss": 1.1241,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.1749465465545654,
"rewards/margins": 0.39173993468284607,
"rewards/rejected": -2.5666866302490234,
"step": 2220
},
{
"epoch": 1.6066282420749278,
"grad_norm": 20.354220577910063,
"learning_rate": 2.626757978793187e-08,
"logits/chosen": -2.025035858154297,
"logits/rejected": -2.018566846847534,
"logps/chosen": -1.0852900743484497,
"logps/rejected": -1.2093784809112549,
"loss": 1.2036,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.1705801486968994,
"rewards/margins": 0.24817702174186707,
"rewards/rejected": -2.4187569618225098,
"step": 2230
},
{
"epoch": 1.6138328530259365,
"grad_norm": 23.587038296125034,
"learning_rate": 2.6058200095628797e-08,
"logits/chosen": -1.9932403564453125,
"logits/rejected": -1.9965318441390991,
"logps/chosen": -0.917451024055481,
"logps/rejected": -1.0861783027648926,
"loss": 1.1444,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.834902048110962,
"rewards/margins": 0.33745482563972473,
"rewards/rejected": -2.172356605529785,
"step": 2240
},
{
"epoch": 1.6210374639769451,
"grad_norm": 18.85455738747805,
"learning_rate": 2.584874601622854e-08,
"logits/chosen": -2.0577220916748047,
"logits/rejected": -2.048609972000122,
"logps/chosen": -1.0844237804412842,
"logps/rejected": -1.2160663604736328,
"loss": 1.2066,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.1688475608825684,
"rewards/margins": 0.26328495144844055,
"rewards/rejected": -2.4321327209472656,
"step": 2250
},
{
"epoch": 1.6282420749279538,
"grad_norm": 21.402470423847422,
"learning_rate": 2.5639232273487993e-08,
"logits/chosen": -1.9839709997177124,
"logits/rejected": -1.9742904901504517,
"logps/chosen": -0.9783967137336731,
"logps/rejected": -1.1004703044891357,
"loss": 1.2014,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9567934274673462,
"rewards/margins": 0.24414721131324768,
"rewards/rejected": -2.2009406089782715,
"step": 2260
},
{
"epoch": 1.6354466858789625,
"grad_norm": 20.835507437389413,
"learning_rate": 2.5429673595358142e-08,
"logits/chosen": -2.0180060863494873,
"logits/rejected": -2.0165977478027344,
"logps/chosen": -1.044081687927246,
"logps/rejected": -1.1655093431472778,
"loss": 1.198,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.088163375854492,
"rewards/margins": 0.24285531044006348,
"rewards/rejected": -2.3310186862945557,
"step": 2270
},
{
"epoch": 1.6426512968299711,
"grad_norm": 23.508207659870397,
"learning_rate": 2.5220084712948764e-08,
"logits/chosen": -1.9833030700683594,
"logits/rejected": -1.9724918603897095,
"logps/chosen": -1.1200716495513916,
"logps/rejected": -1.2384498119354248,
"loss": 1.1901,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.240143299102783,
"rewards/margins": 0.2367565631866455,
"rewards/rejected": -2.4768996238708496,
"step": 2280
},
{
"epoch": 1.6498559077809798,
"grad_norm": 19.732301868207852,
"learning_rate": 2.5010480359492838e-08,
"logits/chosen": -1.9684820175170898,
"logits/rejected": -1.9655864238739014,
"logps/chosen": -1.0514031648635864,
"logps/rejected": -1.1109318733215332,
"loss": 1.287,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.102806329727173,
"rewards/margins": 0.11905747652053833,
"rewards/rejected": -2.2218637466430664,
"step": 2290
},
{
"epoch": 1.6570605187319885,
"grad_norm": 21.18423154150406,
"learning_rate": 2.480087526931091e-08,
"logits/chosen": -2.0103702545166016,
"logits/rejected": -1.99822998046875,
"logps/chosen": -1.0027062892913818,
"logps/rejected": -1.1175159215927124,
"loss": 1.2155,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.0054125785827637,
"rewards/margins": 0.22961954772472382,
"rewards/rejected": -2.235031843185425,
"step": 2300
},
{
"epoch": 1.6642651296829971,
"grad_norm": 19.472737759749933,
"learning_rate": 2.4591284176775326e-08,
"logits/chosen": -1.969321608543396,
"logits/rejected": -1.9655838012695312,
"logps/chosen": -1.0758287906646729,
"logps/rejected": -1.1590468883514404,
"loss": 1.2565,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.1516575813293457,
"rewards/margins": 0.16643603146076202,
"rewards/rejected": -2.318093776702881,
"step": 2310
},
{
"epoch": 1.6714697406340058,
"grad_norm": 21.89430802713767,
"learning_rate": 2.4381721815274443e-08,
"logits/chosen": -2.043560028076172,
"logits/rejected": -2.0437939167022705,
"logps/chosen": -1.0199779272079468,
"logps/rejected": -1.1517260074615479,
"loss": 1.1926,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0399558544158936,
"rewards/margins": 0.26349639892578125,
"rewards/rejected": -2.3034520149230957,
"step": 2320
},
{
"epoch": 1.6786743515850144,
"grad_norm": 19.55460769848809,
"learning_rate": 2.4172202916176936e-08,
"logits/chosen": -2.046525478363037,
"logits/rejected": -2.048698663711548,
"logps/chosen": -0.9680402874946594,
"logps/rejected": -1.1348791122436523,
"loss": 1.1576,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.9360805749893188,
"rewards/margins": 0.33367738127708435,
"rewards/rejected": -2.2697582244873047,
"step": 2330
},
{
"epoch": 1.685878962536023,
"grad_norm": 19.2299692018981,
"learning_rate": 2.3962742207796268e-08,
"logits/chosen": -1.9817912578582764,
"logits/rejected": -1.9795825481414795,
"logps/chosen": -0.9565426707267761,
"logps/rejected": -1.1191436052322388,
"loss": 1.1591,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9130853414535522,
"rewards/margins": 0.325202077627182,
"rewards/rejected": -2.2382872104644775,
"step": 2340
},
{
"epoch": 1.6930835734870318,
"grad_norm": 22.728076530066133,
"learning_rate": 2.3753354414355334e-08,
"logits/chosen": -1.950277328491211,
"logits/rejected": -1.9395864009857178,
"logps/chosen": -1.0648443698883057,
"logps/rejected": -1.1816734075546265,
"loss": 1.2133,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.1296887397766113,
"rewards/margins": 0.23365814983844757,
"rewards/rejected": -2.363346815109253,
"step": 2350
},
{
"epoch": 1.7002881844380404,
"grad_norm": 18.628936490431073,
"learning_rate": 2.3544054254951408e-08,
"logits/chosen": -1.9891974925994873,
"logits/rejected": -1.9805711507797241,
"logps/chosen": -0.9382593035697937,
"logps/rejected": -1.1349287033081055,
"loss": 1.1143,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.8765186071395874,
"rewards/margins": 0.3933386504650116,
"rewards/rejected": -2.269857406616211,
"step": 2360
},
{
"epoch": 1.707492795389049,
"grad_norm": 18.210045869943635,
"learning_rate": 2.3334856442521435e-08,
"logits/chosen": -2.037346124649048,
"logits/rejected": -2.0299322605133057,
"logps/chosen": -1.0963926315307617,
"logps/rejected": -1.1663601398468018,
"loss": 1.2702,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.1927852630615234,
"rewards/margins": 0.1399351954460144,
"rewards/rejected": -2.3327202796936035,
"step": 2370
},
{
"epoch": 1.7146974063400577,
"grad_norm": 19.872042875254735,
"learning_rate": 2.3125775682807826e-08,
"logits/chosen": -2.0520217418670654,
"logits/rejected": -2.051881790161133,
"logps/chosen": -1.166526198387146,
"logps/rejected": -1.2666442394256592,
"loss": 1.232,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.333052396774292,
"rewards/margins": 0.20023572444915771,
"rewards/rejected": -2.5332884788513184,
"step": 2380
},
{
"epoch": 1.7219020172910664,
"grad_norm": 20.61950838692453,
"learning_rate": 2.291682667332464e-08,
"logits/chosen": -2.0643744468688965,
"logits/rejected": -2.059324264526367,
"logps/chosen": -1.0485190153121948,
"logps/rejected": -1.1792809963226318,
"loss": 1.1918,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.0970380306243896,
"rewards/margins": 0.26152390241622925,
"rewards/rejected": -2.3585619926452637,
"step": 2390
},
{
"epoch": 1.729106628242075,
"grad_norm": 15.170314603632118,
"learning_rate": 2.2708024102324454e-08,
"logits/chosen": -2.0271968841552734,
"logits/rejected": -2.0215301513671875,
"logps/chosen": -1.0329598188400269,
"logps/rejected": -1.2096232175827026,
"loss": 1.1496,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0659196376800537,
"rewards/margins": 0.3533265292644501,
"rewards/rejected": -2.4192464351654053,
"step": 2400
},
{
"epoch": 1.7363112391930837,
"grad_norm": 22.430906564092297,
"learning_rate": 2.2499382647765797e-08,
"logits/chosen": -2.0221495628356934,
"logits/rejected": -2.018479347229004,
"logps/chosen": -1.0721267461776733,
"logps/rejected": -1.1613738536834717,
"loss": 1.2462,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1442534923553467,
"rewards/margins": 0.17849409580230713,
"rewards/rejected": -2.3227477073669434,
"step": 2410
},
{
"epoch": 1.7435158501440924,
"grad_norm": 21.08508798237925,
"learning_rate": 2.2290916976281427e-08,
"logits/chosen": -2.003178119659424,
"logits/rejected": -1.996995210647583,
"logps/chosen": -0.9998496174812317,
"logps/rejected": -1.1318352222442627,
"loss": 1.2149,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9996992349624634,
"rewards/margins": 0.2639711797237396,
"rewards/rejected": -2.2636704444885254,
"step": 2420
},
{
"epoch": 1.7507204610951008,
"grad_norm": 18.07451059922042,
"learning_rate": 2.2082641742147238e-08,
"logits/chosen": -1.9808975458145142,
"logits/rejected": -1.9742380380630493,
"logps/chosen": -1.016533613204956,
"logps/rejected": -1.2077422142028809,
"loss": 1.1153,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.033067226409912,
"rewards/margins": 0.3824174702167511,
"rewards/rejected": -2.4154844284057617,
"step": 2430
},
{
"epoch": 1.7579250720461095,
"grad_norm": 20.71956116817728,
"learning_rate": 2.1874571586252177e-08,
"logits/chosen": -2.029297351837158,
"logits/rejected": -2.02240252494812,
"logps/chosen": -1.0277835130691528,
"logps/rejected": -1.1069574356079102,
"loss": 1.2557,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0555670261383057,
"rewards/margins": 0.1583479940891266,
"rewards/rejected": -2.2139148712158203,
"step": 2440
},
{
"epoch": 1.7651296829971181,
"grad_norm": 20.82700891746505,
"learning_rate": 2.1666721135069037e-08,
"logits/chosen": -2.014781951904297,
"logits/rejected": -2.0112876892089844,
"logps/chosen": -1.1093708276748657,
"logps/rejected": -1.2040464878082275,
"loss": 1.2422,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.2187416553497314,
"rewards/margins": 0.18935146927833557,
"rewards/rejected": -2.408092975616455,
"step": 2450
},
{
"epoch": 1.7723342939481268,
"grad_norm": 15.571889556211572,
"learning_rate": 2.145910499962628e-08,
"logits/chosen": -2.065460681915283,
"logits/rejected": -2.0574898719787598,
"logps/chosen": -0.9591764211654663,
"logps/rejected": -1.1014626026153564,
"loss": 1.1832,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9183528423309326,
"rewards/margins": 0.2845722734928131,
"rewards/rejected": -2.202925205230713,
"step": 2460
},
{
"epoch": 1.7795389048991355,
"grad_norm": 23.742748347267703,
"learning_rate": 2.1251737774480915e-08,
"logits/chosen": -2.0418546199798584,
"logits/rejected": -2.032393455505371,
"logps/chosen": -1.169003963470459,
"logps/rejected": -1.2600603103637695,
"loss": 1.2691,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.338007926940918,
"rewards/margins": 0.18211248517036438,
"rewards/rejected": -2.520120620727539,
"step": 2470
},
{
"epoch": 1.7867435158501441,
"grad_norm": 17.445595980137064,
"learning_rate": 2.104463403669264e-08,
"logits/chosen": -2.0002996921539307,
"logits/rejected": -1.9975649118423462,
"logps/chosen": -1.0450685024261475,
"logps/rejected": -1.189254879951477,
"loss": 1.1814,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.090137004852295,
"rewards/margins": 0.288372665643692,
"rewards/rejected": -2.378509759902954,
"step": 2480
},
{
"epoch": 1.7939481268011528,
"grad_norm": 17.073648259711092,
"learning_rate": 2.0837808344799028e-08,
"logits/chosen": -1.982496976852417,
"logits/rejected": -1.9782216548919678,
"logps/chosen": -0.9407302141189575,
"logps/rejected": -1.0727360248565674,
"loss": 1.1832,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.881460428237915,
"rewards/margins": 0.26401159167289734,
"rewards/rejected": -2.1454720497131348,
"step": 2490
},
{
"epoch": 1.8011527377521612,
"grad_norm": 18.275713028107056,
"learning_rate": 2.063127523779219e-08,
"logits/chosen": -1.9809592962265015,
"logits/rejected": -1.9768626689910889,
"logps/chosen": -1.0079666376113892,
"logps/rejected": -1.1944630146026611,
"loss": 1.1139,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.0159332752227783,
"rewards/margins": 0.372992604970932,
"rewards/rejected": -2.3889260292053223,
"step": 2500
},
{
"epoch": 1.8083573487031699,
"grad_norm": 19.734595343704587,
"learning_rate": 2.0425049234096737e-08,
"logits/chosen": -1.9899470806121826,
"logits/rejected": -1.9840829372406006,
"logps/chosen": -1.0095808506011963,
"logps/rejected": -1.1258000135421753,
"loss": 1.2167,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0191617012023926,
"rewards/margins": 0.23243825137615204,
"rewards/rejected": -2.2516000270843506,
"step": 2510
},
{
"epoch": 1.8155619596541785,
"grad_norm": 19.367202320168705,
"learning_rate": 2.0219144830549163e-08,
"logits/chosen": -1.9627516269683838,
"logits/rejected": -1.9618648290634155,
"logps/chosen": -1.0147254467010498,
"logps/rejected": -1.1612242460250854,
"loss": 1.1826,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0294508934020996,
"rewards/margins": 0.29299798607826233,
"rewards/rejected": -2.322448492050171,
"step": 2520
},
{
"epoch": 1.8227665706051872,
"grad_norm": 19.099230695899127,
"learning_rate": 2.0013576501378823e-08,
"logits/chosen": -1.9792697429656982,
"logits/rejected": -1.9728553295135498,
"logps/chosen": -1.0101633071899414,
"logps/rejected": -1.144830584526062,
"loss": 1.1941,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.020326614379883,
"rewards/margins": 0.26933470368385315,
"rewards/rejected": -2.289661169052124,
"step": 2530
},
{
"epoch": 1.8299711815561959,
"grad_norm": 20.166585492166963,
"learning_rate": 1.9808358697190426e-08,
"logits/chosen": -1.972886085510254,
"logits/rejected": -1.969310998916626,
"logps/chosen": -0.9306098222732544,
"logps/rejected": -1.065213918685913,
"loss": 1.1987,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.8612196445465088,
"rewards/margins": 0.26920828223228455,
"rewards/rejected": -2.130427837371826,
"step": 2540
},
{
"epoch": 1.8371757925072045,
"grad_norm": 21.09479586621852,
"learning_rate": 1.9603505843948214e-08,
"logits/chosen": -2.017627239227295,
"logits/rejected": -2.0076773166656494,
"logps/chosen": -0.9474620819091797,
"logps/rejected": -1.1191534996032715,
"loss": 1.1394,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.8949241638183594,
"rewards/margins": 0.3433830738067627,
"rewards/rejected": -2.238306999206543,
"step": 2550
},
{
"epoch": 1.8443804034582132,
"grad_norm": 20.423562577336806,
"learning_rate": 1.9399032341961886e-08,
"logits/chosen": -1.9809995889663696,
"logits/rejected": -1.965026617050171,
"logps/chosen": -0.9898856282234192,
"logps/rejected": -1.0629730224609375,
"loss": 1.2699,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -1.9797712564468384,
"rewards/margins": 0.14617487788200378,
"rewards/rejected": -2.125946044921875,
"step": 2560
},
{
"epoch": 1.8515850144092219,
"grad_norm": 26.09646394087967,
"learning_rate": 1.9194952564874323e-08,
"logits/chosen": -2.0236237049102783,
"logits/rejected": -2.0175366401672363,
"logps/chosen": -1.0653067827224731,
"logps/rejected": -1.2080482244491577,
"loss": 1.1687,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.1306135654449463,
"rewards/margins": 0.2854826748371124,
"rewards/rejected": -2.4160964488983154,
"step": 2570
},
{
"epoch": 1.8587896253602305,
"grad_norm": 20.54864054770584,
"learning_rate": 1.8991280858651157e-08,
"logits/chosen": -1.9798238277435303,
"logits/rejected": -1.9740060567855835,
"logps/chosen": -1.0638706684112549,
"logps/rejected": -1.1493985652923584,
"loss": 1.2503,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.1277413368225098,
"rewards/margins": 0.17105570435523987,
"rewards/rejected": -2.298797130584717,
"step": 2580
},
{
"epoch": 1.8659942363112392,
"grad_norm": 16.856180614276347,
"learning_rate": 1.8788031540572327e-08,
"logits/chosen": -1.9806411266326904,
"logits/rejected": -1.97232985496521,
"logps/chosen": -0.9993877410888672,
"logps/rejected": -1.1446069478988647,
"loss": 1.1723,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9987754821777344,
"rewards/margins": 0.2904384136199951,
"rewards/rejected": -2.2892138957977295,
"step": 2590
},
{
"epoch": 1.8731988472622478,
"grad_norm": 17.03483106236852,
"learning_rate": 1.858521889822565e-08,
"logits/chosen": -1.99444580078125,
"logits/rejected": -1.9968360662460327,
"logps/chosen": -0.9734609723091125,
"logps/rejected": -1.0829355716705322,
"loss": 1.2235,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -1.946921944618225,
"rewards/margins": 0.21894919872283936,
"rewards/rejected": -2.1658711433410645,
"step": 2600
},
{
"epoch": 1.8804034582132565,
"grad_norm": 16.460944487260154,
"learning_rate": 1.8382857188502422e-08,
"logits/chosen": -1.987308144569397,
"logits/rejected": -1.9824047088623047,
"logps/chosen": -0.9851255416870117,
"logps/rejected": -1.1126196384429932,
"loss": 1.1823,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.9702510833740234,
"rewards/margins": 0.2549881935119629,
"rewards/rejected": -2.2252392768859863,
"step": 2610
},
{
"epoch": 1.8876080691642652,
"grad_norm": 22.305541216871692,
"learning_rate": 1.8180960636595234e-08,
"logits/chosen": -1.9680871963500977,
"logits/rejected": -1.9659591913223267,
"logps/chosen": -1.0361220836639404,
"logps/rejected": -1.1791760921478271,
"loss": 1.1801,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.072244167327881,
"rewards/margins": 0.2861078977584839,
"rewards/rejected": -2.3583521842956543,
"step": 2620
},
{
"epoch": 1.8948126801152738,
"grad_norm": 20.388566247239506,
"learning_rate": 1.7979543434998015e-08,
"logits/chosen": -2.036707639694214,
"logits/rejected": -2.041584014892578,
"logps/chosen": -1.1235167980194092,
"logps/rejected": -1.2117664813995361,
"loss": 1.238,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.2470335960388184,
"rewards/margins": 0.17649903893470764,
"rewards/rejected": -2.4235329627990723,
"step": 2630
},
{
"epoch": 1.9020172910662825,
"grad_norm": 26.051372198929382,
"learning_rate": 1.7778619742508345e-08,
"logits/chosen": -2.0007290840148926,
"logits/rejected": -1.9938418865203857,
"logps/chosen": -1.0931371450424194,
"logps/rejected": -1.1865342855453491,
"loss": 1.2551,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.186274290084839,
"rewards/margins": 0.186794251203537,
"rewards/rejected": -2.3730685710906982,
"step": 2640
},
{
"epoch": 1.9092219020172911,
"grad_norm": 23.4369600871899,
"learning_rate": 1.757820368323213e-08,
"logits/chosen": -1.9935007095336914,
"logits/rejected": -1.9837433099746704,
"logps/chosen": -1.1060357093811035,
"logps/rejected": -1.265039324760437,
"loss": 1.1608,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.212071418762207,
"rewards/margins": 0.31800705194473267,
"rewards/rejected": -2.530078649520874,
"step": 2650
},
{
"epoch": 1.9164265129682998,
"grad_norm": 22.386136161896175,
"learning_rate": 1.7378309345590803e-08,
"logits/chosen": -2.011643409729004,
"logits/rejected": -2.0210494995117188,
"logps/chosen": -1.0864661931991577,
"logps/rejected": -1.2283092737197876,
"loss": 1.1822,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1729323863983154,
"rewards/margins": 0.28368598222732544,
"rewards/rejected": -2.456618547439575,
"step": 2660
},
{
"epoch": 1.9236311239193085,
"grad_norm": 20.06093474919507,
"learning_rate": 1.717895078133088e-08,
"logits/chosen": -2.058295726776123,
"logits/rejected": -2.054426670074463,
"logps/chosen": -1.0596580505371094,
"logps/rejected": -1.2001304626464844,
"loss": 1.1836,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1193161010742188,
"rewards/margins": 0.2809443771839142,
"rewards/rejected": -2.4002609252929688,
"step": 2670
},
{
"epoch": 1.9308357348703171,
"grad_norm": 21.13851274050523,
"learning_rate": 1.698014200453624e-08,
"logits/chosen": -2.0123298168182373,
"logits/rejected": -2.0198843479156494,
"logps/chosen": -1.0314449071884155,
"logps/rejected": -1.162095308303833,
"loss": 1.1778,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.062889814376831,
"rewards/margins": 0.26130083203315735,
"rewards/rejected": -2.324190616607666,
"step": 2680
},
{
"epoch": 1.9380403458213258,
"grad_norm": 24.618506838094923,
"learning_rate": 1.6781896990642964e-08,
"logits/chosen": -1.9457242488861084,
"logits/rejected": -1.9430221319198608,
"logps/chosen": -1.1473594903945923,
"logps/rejected": -1.237646460533142,
"loss": 1.2441,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.2947189807891846,
"rewards/margins": 0.18057429790496826,
"rewards/rejected": -2.475292921066284,
"step": 2690
},
{
"epoch": 1.9452449567723344,
"grad_norm": 24.327220053141147,
"learning_rate": 1.658422967545693e-08,
"logits/chosen": -2.047414541244507,
"logits/rejected": -2.0341315269470215,
"logps/chosen": -1.0049512386322021,
"logps/rejected": -1.1183750629425049,
"loss": 1.2203,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -2.0099024772644043,
"rewards/margins": 0.22684772312641144,
"rewards/rejected": -2.2367501258850098,
"step": 2700
},
{
"epoch": 1.952449567723343,
"grad_norm": 20.72145176972204,
"learning_rate": 1.638715395417418e-08,
"logits/chosen": -2.023325204849243,
"logits/rejected": -2.0211892127990723,
"logps/chosen": -1.068524956703186,
"logps/rejected": -1.2049812078475952,
"loss": 1.1841,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.137049913406372,
"rewards/margins": 0.2729126214981079,
"rewards/rejected": -2.4099624156951904,
"step": 2710
},
{
"epoch": 1.9596541786743515,
"grad_norm": 22.430027131821607,
"learning_rate": 1.619068368040416e-08,
"logits/chosen": -2.0218818187713623,
"logits/rejected": -2.0176615715026855,
"logps/chosen": -1.0006544589996338,
"logps/rejected": -1.178510069847107,
"loss": 1.1299,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.0013089179992676,
"rewards/margins": 0.3557109236717224,
"rewards/rejected": -2.357020139694214,
"step": 2720
},
{
"epoch": 1.9668587896253602,
"grad_norm": 17.44373356290383,
"learning_rate": 1.5994832665195853e-08,
"logits/chosen": -1.9683250188827515,
"logits/rejected": -1.9688348770141602,
"logps/chosen": -1.0345633029937744,
"logps/rejected": -1.1477251052856445,
"loss": 1.2115,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.069126605987549,
"rewards/margins": 0.22632364928722382,
"rewards/rejected": -2.295450210571289,
"step": 2730
},
{
"epoch": 1.9740634005763689,
"grad_norm": 20.229969474249543,
"learning_rate": 1.5799614676066906e-08,
"logits/chosen": -2.069178819656372,
"logits/rejected": -2.0663199424743652,
"logps/chosen": -0.9492548108100891,
"logps/rejected": -1.086529016494751,
"loss": 1.1756,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.8985096216201782,
"rewards/margins": 0.2745482623577118,
"rewards/rejected": -2.173058032989502,
"step": 2740
},
{
"epoch": 1.9812680115273775,
"grad_norm": 15.993803030514755,
"learning_rate": 1.560504343603587e-08,
"logits/chosen": -1.9762630462646484,
"logits/rejected": -1.9768295288085938,
"logps/chosen": -1.0684893131256104,
"logps/rejected": -1.2240431308746338,
"loss": 1.1605,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.1369786262512207,
"rewards/margins": 0.3111076056957245,
"rewards/rejected": -2.4480862617492676,
"step": 2750
},
{
"epoch": 1.9884726224783862,
"grad_norm": 18.752561218885347,
"learning_rate": 1.541113262265748e-08,
"logits/chosen": -2.069488286972046,
"logits/rejected": -2.0672833919525146,
"logps/chosen": -1.0279682874679565,
"logps/rejected": -1.1458810567855835,
"loss": 1.2067,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.055936574935913,
"rewards/margins": 0.23582550883293152,
"rewards/rejected": -2.291762113571167,
"step": 2760
},
{
"epoch": 1.9956772334293948,
"grad_norm": 25.86621954412604,
"learning_rate": 1.5217895867061227e-08,
"logits/chosen": -2.0054798126220703,
"logits/rejected": -1.9995949268341064,
"logps/chosen": -1.083843469619751,
"logps/rejected": -1.1836421489715576,
"loss": 1.2456,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.167686939239502,
"rewards/margins": 0.1995975375175476,
"rewards/rejected": -2.3672842979431152,
"step": 2770
},
{
"epoch": 2.0028818443804033,
"grad_norm": 22.661974621135478,
"learning_rate": 1.5025346752993098e-08,
"logits/chosen": -1.9982630014419556,
"logits/rejected": -1.9999568462371826,
"logps/chosen": -1.072928547859192,
"logps/rejected": -1.1990954875946045,
"loss": 1.2011,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.145857095718384,
"rewards/margins": 0.2523340880870819,
"rewards/rejected": -2.398190975189209,
"step": 2780
},
{
"epoch": 2.010086455331412,
"grad_norm": 23.303370032175714,
"learning_rate": 1.4833498815860756e-08,
"logits/chosen": -2.052605390548706,
"logits/rejected": -2.0548830032348633,
"logps/chosen": -0.9998857378959656,
"logps/rejected": -1.1840670108795166,
"loss": 1.1499,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.9997714757919312,
"rewards/margins": 0.3683624267578125,
"rewards/rejected": -2.368134021759033,
"step": 2790
},
{
"epoch": 2.0172910662824206,
"grad_norm": 18.168458664217237,
"learning_rate": 1.4642365541781993e-08,
"logits/chosen": -1.9642353057861328,
"logits/rejected": -1.9557300806045532,
"logps/chosen": -1.0272754430770874,
"logps/rejected": -1.1926220655441284,
"loss": 1.1518,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.054550886154175,
"rewards/margins": 0.33069342374801636,
"rewards/rejected": -2.385244131088257,
"step": 2800
},
{
"epoch": 2.0244956772334293,
"grad_norm": 17.943557689678926,
"learning_rate": 1.4451960366636745e-08,
"logits/chosen": -2.021503448486328,
"logits/rejected": -2.032627582550049,
"logps/chosen": -1.0408049821853638,
"logps/rejected": -1.1747071743011475,
"loss": 1.1831,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0816099643707275,
"rewards/margins": 0.2678046226501465,
"rewards/rejected": -2.349414348602295,
"step": 2810
},
{
"epoch": 2.031700288184438,
"grad_norm": 19.245950456686252,
"learning_rate": 1.4262296675122592e-08,
"logits/chosen": -2.014530658721924,
"logits/rejected": -2.0107593536376953,
"logps/chosen": -1.0312591791152954,
"logps/rejected": -1.1913511753082275,
"loss": 1.1526,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.062518358230591,
"rewards/margins": 0.32018426060676575,
"rewards/rejected": -2.382702350616455,
"step": 2820
},
{
"epoch": 2.0389048991354466,
"grad_norm": 17.81271779194038,
"learning_rate": 1.407338779981389e-08,
"logits/chosen": -1.9946855306625366,
"logits/rejected": -1.9926750659942627,
"logps/chosen": -0.9136768579483032,
"logps/rejected": -1.0949757099151611,
"loss": 1.1116,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.8273537158966064,
"rewards/margins": 0.3625979423522949,
"rewards/rejected": -2.1899514198303223,
"step": 2830
},
{
"epoch": 2.0461095100864553,
"grad_norm": 21.372732915554494,
"learning_rate": 1.3885247020224534e-08,
"logits/chosen": -2.0047404766082764,
"logits/rejected": -1.9999935626983643,
"logps/chosen": -1.0015006065368652,
"logps/rejected": -1.1312209367752075,
"loss": 1.1909,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0030012130737305,
"rewards/margins": 0.2594410181045532,
"rewards/rejected": -2.262441873550415,
"step": 2840
},
{
"epoch": 2.053314121037464,
"grad_norm": 17.00779028228168,
"learning_rate": 1.369788756187445e-08,
"logits/chosen": -2.0100817680358887,
"logits/rejected": -2.006643056869507,
"logps/chosen": -1.0269567966461182,
"logps/rejected": -1.1220487356185913,
"loss": 1.2348,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -2.0539135932922363,
"rewards/margins": 0.1901838779449463,
"rewards/rejected": -2.2440974712371826,
"step": 2850
},
{
"epoch": 2.0605187319884726,
"grad_norm": 18.64801286449761,
"learning_rate": 1.3511322595359925e-08,
"logits/chosen": -2.035876750946045,
"logits/rejected": -2.0276687145233154,
"logps/chosen": -0.9376466870307922,
"logps/rejected": -1.1057026386260986,
"loss": 1.1393,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.8752933740615845,
"rewards/margins": 0.3361119329929352,
"rewards/rejected": -2.2114052772521973,
"step": 2860
},
{
"epoch": 2.0677233429394812,
"grad_norm": 17.383929031131085,
"learning_rate": 1.3325565235427716e-08,
"logits/chosen": -2.0277578830718994,
"logits/rejected": -2.026214122772217,
"logps/chosen": -0.982566237449646,
"logps/rejected": -1.1270108222961426,
"loss": 1.1768,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.965132474899292,
"rewards/margins": 0.2888889014720917,
"rewards/rejected": -2.254021644592285,
"step": 2870
},
{
"epoch": 2.07492795389049,
"grad_norm": 17.047214261003482,
"learning_rate": 1.3140628540053218e-08,
"logits/chosen": -1.9946644306182861,
"logits/rejected": -1.9967546463012695,
"logps/chosen": -0.9750404357910156,
"logps/rejected": -1.1103841066360474,
"loss": 1.1831,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.9500808715820312,
"rewards/margins": 0.2706873416900635,
"rewards/rejected": -2.2207682132720947,
"step": 2880
},
{
"epoch": 2.0821325648414986,
"grad_norm": 19.26958654488348,
"learning_rate": 1.2956525509522451e-08,
"logits/chosen": -1.9811160564422607,
"logits/rejected": -1.9807733297348022,
"logps/chosen": -1.111905574798584,
"logps/rejected": -1.2153931856155396,
"loss": 1.2339,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.223811149597168,
"rewards/margins": 0.2069750279188156,
"rewards/rejected": -2.430786371231079,
"step": 2890
},
{
"epoch": 2.089337175792507,
"grad_norm": 19.78025222762576,
"learning_rate": 1.2773269085518267e-08,
"logits/chosen": -2.0117239952087402,
"logits/rejected": -2.0130703449249268,
"logps/chosen": -1.0760631561279297,
"logps/rejected": -1.206182599067688,
"loss": 1.1842,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.1521263122558594,
"rewards/margins": 0.2602389454841614,
"rewards/rejected": -2.412365198135376,
"step": 2900
},
{
"epoch": 2.096541786743516,
"grad_norm": 20.115808813369917,
"learning_rate": 1.2590872150210574e-08,
"logits/chosen": -2.06766414642334,
"logits/rejected": -2.0607457160949707,
"logps/chosen": -1.0578199625015259,
"logps/rejected": -1.1676528453826904,
"loss": 1.226,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.1156399250030518,
"rewards/margins": 0.21966581046581268,
"rewards/rejected": -2.335305690765381,
"step": 2910
},
{
"epoch": 2.1037463976945245,
"grad_norm": 20.584492670465366,
"learning_rate": 1.2409347525350775e-08,
"logits/chosen": -2.0331482887268066,
"logits/rejected": -2.02323842048645,
"logps/chosen": -1.1097338199615479,
"logps/rejected": -1.2564305067062378,
"loss": 1.1674,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.2194676399230957,
"rewards/margins": 0.2933935225009918,
"rewards/rejected": -2.5128610134124756,
"step": 2920
},
{
"epoch": 2.110951008645533,
"grad_norm": 22.815499062996494,
"learning_rate": 1.2228707971370421e-08,
"logits/chosen": -2.0209853649139404,
"logits/rejected": -2.0140042304992676,
"logps/chosen": -0.9935145378112793,
"logps/rejected": -1.105963945388794,
"loss": 1.2266,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9870290756225586,
"rewards/margins": 0.22489885985851288,
"rewards/rejected": -2.211927890777588,
"step": 2930
},
{
"epoch": 2.118155619596542,
"grad_norm": 21.166376348681847,
"learning_rate": 1.2048966186484282e-08,
"logits/chosen": -2.017411708831787,
"logits/rejected": -2.0006022453308105,
"logps/chosen": -1.1170910596847534,
"logps/rejected": -1.2319118976593018,
"loss": 1.2115,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.234182119369507,
"rewards/margins": 0.2296416014432907,
"rewards/rejected": -2.4638237953186035,
"step": 2940
},
{
"epoch": 2.1253602305475505,
"grad_norm": 28.013077396296502,
"learning_rate": 1.187013480579762e-08,
"logits/chosen": -2.0150485038757324,
"logits/rejected": -2.0178308486938477,
"logps/chosen": -1.0425622463226318,
"logps/rejected": -1.1764047145843506,
"loss": 1.2005,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.0851244926452637,
"rewards/margins": 0.2676849961280823,
"rewards/rejected": -2.352809429168701,
"step": 2950
},
{
"epoch": 2.132564841498559,
"grad_norm": 39.69497918705247,
"learning_rate": 1.1692226400418073e-08,
"logits/chosen": -1.9483245611190796,
"logits/rejected": -1.9468435049057007,
"logps/chosen": -1.0812904834747314,
"logps/rejected": -1.211723804473877,
"loss": 1.2174,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.162580966949463,
"rewards/margins": 0.2608664035797119,
"rewards/rejected": -2.423447608947754,
"step": 2960
},
{
"epoch": 2.139769452449568,
"grad_norm": 16.186051185633424,
"learning_rate": 1.1515253476571923e-08,
"logits/chosen": -1.9795656204223633,
"logits/rejected": -1.9738327264785767,
"logps/chosen": -1.0102039575576782,
"logps/rejected": -1.191947102546692,
"loss": 1.1203,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.0204079151153564,
"rewards/margins": 0.36348623037338257,
"rewards/rejected": -2.383894205093384,
"step": 2970
},
{
"epoch": 2.1469740634005765,
"grad_norm": 19.900013908570074,
"learning_rate": 1.133922847472496e-08,
"logits/chosen": -2.0021681785583496,
"logits/rejected": -2.002943515777588,
"logps/chosen": -1.1099964380264282,
"logps/rejected": -1.2084180116653442,
"loss": 1.2521,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.2199928760528564,
"rewards/margins": 0.1968432366847992,
"rewards/rejected": -2.4168360233306885,
"step": 2980
},
{
"epoch": 2.154178674351585,
"grad_norm": 22.72131090745606,
"learning_rate": 1.1164163768707952e-08,
"logits/chosen": -2.0033812522888184,
"logits/rejected": -1.9982162714004517,
"logps/chosen": -1.0048575401306152,
"logps/rejected": -1.1429532766342163,
"loss": 1.1846,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0097150802612305,
"rewards/margins": 0.2761916518211365,
"rewards/rejected": -2.2859065532684326,
"step": 2990
},
{
"epoch": 2.161383285302594,
"grad_norm": 17.87155121185044,
"learning_rate": 1.0990071664846861e-08,
"logits/chosen": -1.9833234548568726,
"logits/rejected": -1.9822227954864502,
"logps/chosen": -1.0197571516036987,
"logps/rejected": -1.198885202407837,
"loss": 1.1594,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0395143032073975,
"rewards/margins": 0.35825610160827637,
"rewards/rejected": -2.397770404815674,
"step": 3000
},
{
"epoch": 2.1685878962536025,
"grad_norm": 18.894758055694265,
"learning_rate": 1.0816964401097739e-08,
"logits/chosen": -1.9618886709213257,
"logits/rejected": -1.9587116241455078,
"logps/chosen": -0.9558318853378296,
"logps/rejected": -1.0796352624893188,
"loss": 1.205,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.9116637706756592,
"rewards/margins": 0.24760663509368896,
"rewards/rejected": -2.1592705249786377,
"step": 3010
},
{
"epoch": 2.175792507204611,
"grad_norm": 19.906085929832862,
"learning_rate": 1.0644854146186406e-08,
"logits/chosen": -2.025203227996826,
"logits/rejected": -2.0189619064331055,
"logps/chosen": -1.024359107017517,
"logps/rejected": -1.1828162670135498,
"loss": 1.1636,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.048718214035034,
"rewards/margins": 0.31691429018974304,
"rewards/rejected": -2.3656325340270996,
"step": 3020
},
{
"epoch": 2.18299711815562,
"grad_norm": 19.050052296968172,
"learning_rate": 1.0473752998753114e-08,
"logits/chosen": -2.0075266361236572,
"logits/rejected": -1.9991521835327148,
"logps/chosen": -1.0193443298339844,
"logps/rejected": -1.1795679330825806,
"loss": 1.1534,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.0386886596679688,
"rewards/margins": 0.3204469382762909,
"rewards/rejected": -2.359135866165161,
"step": 3030
},
{
"epoch": 2.1902017291066285,
"grad_norm": 19.33190356248751,
"learning_rate": 1.030367298650201e-08,
"logits/chosen": -2.0206215381622314,
"logits/rejected": -2.0204710960388184,
"logps/chosen": -1.039102554321289,
"logps/rejected": -1.191624402999878,
"loss": 1.1575,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.078205108642578,
"rewards/margins": 0.30504345893859863,
"rewards/rejected": -2.383248805999756,
"step": 3040
},
{
"epoch": 2.1974063400576367,
"grad_norm": 21.9617981131132,
"learning_rate": 1.0134626065355675e-08,
"logits/chosen": -2.074868679046631,
"logits/rejected": -2.071895122528076,
"logps/chosen": -1.0231993198394775,
"logps/rejected": -1.1662580966949463,
"loss": 1.1883,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.046398639678955,
"rewards/margins": 0.2861180305480957,
"rewards/rejected": -2.3325161933898926,
"step": 3050
},
{
"epoch": 2.2046109510086453,
"grad_norm": 19.55623294249282,
"learning_rate": 9.966624118614611e-09,
"logits/chosen": -2.013423442840576,
"logits/rejected": -2.0084660053253174,
"logps/chosen": -1.0631778240203857,
"logps/rejected": -1.2087138891220093,
"loss": 1.1876,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.1263556480407715,
"rewards/margins": 0.2910720705986023,
"rewards/rejected": -2.4174277782440186,
"step": 3060
},
{
"epoch": 2.211815561959654,
"grad_norm": 14.867171520984334,
"learning_rate": 9.799678956121976e-09,
"logits/chosen": -1.970645546913147,
"logits/rejected": -1.9662506580352783,
"logps/chosen": -1.030286431312561,
"logps/rejected": -1.1386739015579224,
"loss": 1.2008,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.060572862625122,
"rewards/margins": 0.21677501499652863,
"rewards/rejected": -2.2773478031158447,
"step": 3070
},
{
"epoch": 2.2190201729106627,
"grad_norm": 23.6722369037589,
"learning_rate": 9.633802313433314e-09,
"logits/chosen": -1.942859411239624,
"logits/rejected": -1.9486020803451538,
"logps/chosen": -1.0193486213684082,
"logps/rejected": -1.1251273155212402,
"loss": 1.2058,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0386972427368164,
"rewards/margins": 0.21155771613121033,
"rewards/rejected": -2.2502546310424805,
"step": 3080
},
{
"epoch": 2.2262247838616713,
"grad_norm": 20.75677230247985,
"learning_rate": 9.469005850991705e-09,
"logits/chosen": -2.0128586292266846,
"logits/rejected": -2.0072181224823,
"logps/chosen": -1.0143338441848755,
"logps/rejected": -1.1310632228851318,
"loss": 1.2348,
"rewards/accuracies": 0.53125,
"rewards/chosen": -2.028667688369751,
"rewards/margins": 0.23345866799354553,
"rewards/rejected": -2.2621264457702637,
"step": 3090
},
{
"epoch": 2.23342939481268,
"grad_norm": 18.746796539063972,
"learning_rate": 9.305301153307949e-09,
"logits/chosen": -2.0094423294067383,
"logits/rejected": -2.017090320587158,
"logps/chosen": -0.9450882077217102,
"logps/rejected": -1.1097722053527832,
"loss": 1.157,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.8901764154434204,
"rewards/margins": 0.32936811447143555,
"rewards/rejected": -2.2195444107055664,
"step": 3100
},
{
"epoch": 2.2406340057636887,
"grad_norm": 18.096061545612795,
"learning_rate": 9.142699728146336e-09,
"logits/chosen": -1.9791491031646729,
"logits/rejected": -1.9722025394439697,
"logps/chosen": -1.0312968492507935,
"logps/rejected": -1.163287878036499,
"loss": 1.2018,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.062593698501587,
"rewards/margins": 0.26398202776908875,
"rewards/rejected": -2.326575756072998,
"step": 3110
},
{
"epoch": 2.2478386167146973,
"grad_norm": 16.703535569291137,
"learning_rate": 8.981213005715627e-09,
"logits/chosen": -2.0036609172821045,
"logits/rejected": -2.006706714630127,
"logps/chosen": -0.9921053647994995,
"logps/rejected": -1.1647652387619019,
"loss": 1.1486,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.984210729598999,
"rewards/margins": 0.34531962871551514,
"rewards/rejected": -2.3295304775238037,
"step": 3120
},
{
"epoch": 2.255043227665706,
"grad_norm": 21.958272583090416,
"learning_rate": 8.820852337865611e-09,
"logits/chosen": -2.0320816040039062,
"logits/rejected": -2.028298854827881,
"logps/chosen": -0.9958044290542603,
"logps/rejected": -1.1435072422027588,
"loss": 1.1727,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9916088581085205,
"rewards/margins": 0.29540568590164185,
"rewards/rejected": -2.2870144844055176,
"step": 3130
},
{
"epoch": 2.2622478386167146,
"grad_norm": 17.08140631103118,
"learning_rate": 8.661628997289044e-09,
"logits/chosen": -1.974020004272461,
"logits/rejected": -1.9698307514190674,
"logps/chosen": -1.0156090259552002,
"logps/rejected": -1.1709401607513428,
"loss": 1.169,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.0312180519104004,
"rewards/margins": 0.31066226959228516,
"rewards/rejected": -2.3418803215026855,
"step": 3140
},
{
"epoch": 2.2694524495677233,
"grad_norm": 16.417561627034264,
"learning_rate": 8.503554176729341e-09,
"logits/chosen": -1.9732913970947266,
"logits/rejected": -1.9715712070465088,
"logps/chosen": -1.0267970561981201,
"logps/rejected": -1.1859861612319946,
"loss": 1.1702,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0535941123962402,
"rewards/margins": 0.3183782696723938,
"rewards/rejected": -2.3719723224639893,
"step": 3150
},
{
"epoch": 2.276657060518732,
"grad_norm": 24.932055477370433,
"learning_rate": 8.346638988193636e-09,
"logits/chosen": -1.9996095895767212,
"logits/rejected": -1.9946562051773071,
"logps/chosen": -0.9252532720565796,
"logps/rejected": -1.0754306316375732,
"loss": 1.1774,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.8505065441131592,
"rewards/margins": 0.30035486817359924,
"rewards/rejected": -2.1508612632751465,
"step": 3160
},
{
"epoch": 2.2838616714697406,
"grad_norm": 23.217906478731962,
"learning_rate": 8.19089446217176e-09,
"logits/chosen": -1.9767364263534546,
"logits/rejected": -1.9664764404296875,
"logps/chosen": -1.002582311630249,
"logps/rejected": -1.1916791200637817,
"loss": 1.121,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.005164623260498,
"rewards/margins": 0.378193199634552,
"rewards/rejected": -2.3833582401275635,
"step": 3170
},
{
"epoch": 2.2910662824207493,
"grad_norm": 17.141991975793108,
"learning_rate": 8.036331546860777e-09,
"logits/chosen": -1.9834896326065063,
"logits/rejected": -1.9829978942871094,
"logps/chosen": -0.950110912322998,
"logps/rejected": -1.0394455194473267,
"loss": 1.2456,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -1.900221824645996,
"rewards/margins": 0.17866934835910797,
"rewards/rejected": -2.0788910388946533,
"step": 3180
},
{
"epoch": 2.298270893371758,
"grad_norm": 23.516146286428995,
"learning_rate": 7.882961107395416e-09,
"logits/chosen": -1.9969555139541626,
"logits/rejected": -1.9910898208618164,
"logps/chosen": -1.130173683166504,
"logps/rejected": -1.1781437397003174,
"loss": 1.3138,
"rewards/accuracies": 0.5,
"rewards/chosen": -2.260347366333008,
"rewards/margins": 0.09594009816646576,
"rewards/rejected": -2.3562874794006348,
"step": 3190
},
{
"epoch": 2.3054755043227666,
"grad_norm": 25.884981502834442,
"learning_rate": 7.73079392508428e-09,
"logits/chosen": -1.966968297958374,
"logits/rejected": -1.9663887023925781,
"logps/chosen": -1.0911871194839478,
"logps/rejected": -1.2788745164871216,
"loss": 1.1537,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.1823742389678955,
"rewards/margins": 0.3753744959831238,
"rewards/rejected": -2.557749032974243,
"step": 3200
},
{
"epoch": 2.3126801152737753,
"grad_norm": 21.445267273811133,
"learning_rate": 7.579840696651938e-09,
"logits/chosen": -1.9995644092559814,
"logits/rejected": -1.9965426921844482,
"logps/chosen": -1.0473155975341797,
"logps/rejected": -1.171209692955017,
"loss": 1.2093,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0946311950683594,
"rewards/margins": 0.2477881908416748,
"rewards/rejected": -2.342419385910034,
"step": 3210
},
{
"epoch": 2.319884726224784,
"grad_norm": 21.167143894724592,
"learning_rate": 7.43011203348704e-09,
"logits/chosen": -1.9143447875976562,
"logits/rejected": -1.911238431930542,
"logps/chosen": -1.0506112575531006,
"logps/rejected": -1.1263036727905273,
"loss": 1.2681,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.101222515106201,
"rewards/margins": 0.15138480067253113,
"rewards/rejected": -2.2526073455810547,
"step": 3220
},
{
"epoch": 2.3270893371757926,
"grad_norm": 18.7122492381894,
"learning_rate": 7.281618460896344e-09,
"logits/chosen": -1.994127869606018,
"logits/rejected": -1.9916470050811768,
"logps/chosen": -0.9652446508407593,
"logps/rejected": -1.1070702075958252,
"loss": 1.1731,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9304893016815186,
"rewards/margins": 0.2836512625217438,
"rewards/rejected": -2.2141404151916504,
"step": 3230
},
{
"epoch": 2.3342939481268012,
"grad_norm": 20.398678891005122,
"learning_rate": 7.134370417364849e-09,
"logits/chosen": -1.9642471075057983,
"logits/rejected": -1.9637800455093384,
"logps/chosen": -1.0007370710372925,
"logps/rejected": -1.1398149728775024,
"loss": 1.2023,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.001474142074585,
"rewards/margins": 0.27815574407577515,
"rewards/rejected": -2.279629945755005,
"step": 3240
},
{
"epoch": 2.34149855907781,
"grad_norm": 23.240945165202064,
"learning_rate": 6.988378253821981e-09,
"logits/chosen": -1.9668891429901123,
"logits/rejected": -1.9658010005950928,
"logps/chosen": -1.0259102582931519,
"logps/rejected": -1.1435902118682861,
"loss": 1.2094,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0518205165863037,
"rewards/margins": 0.2353595793247223,
"rewards/rejected": -2.2871804237365723,
"step": 3250
},
{
"epoch": 2.3487031700288186,
"grad_norm": 20.274147530632312,
"learning_rate": 6.8436522329140186e-09,
"logits/chosen": -1.9758933782577515,
"logits/rejected": -1.9824403524398804,
"logps/chosen": -1.0337318181991577,
"logps/rejected": -1.1588385105133057,
"loss": 1.2104,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.0674636363983154,
"rewards/margins": 0.25021329522132874,
"rewards/rejected": -2.3176770210266113,
"step": 3260
},
{
"epoch": 2.3559077809798272,
"grad_norm": 21.874255456099494,
"learning_rate": 6.700202528282603e-09,
"logits/chosen": -1.977266550064087,
"logits/rejected": -1.9675136804580688,
"logps/chosen": -1.0283175706863403,
"logps/rejected": -1.1439546346664429,
"loss": 1.2152,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0566351413726807,
"rewards/margins": 0.2312743216753006,
"rewards/rejected": -2.2879092693328857,
"step": 3270
},
{
"epoch": 2.363112391930836,
"grad_norm": 21.384973629502426,
"learning_rate": 6.558039223849668e-09,
"logits/chosen": -2.0306007862091064,
"logits/rejected": -2.0210115909576416,
"logps/chosen": -1.036071538925171,
"logps/rejected": -1.2443821430206299,
"loss": 1.1148,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -2.072143077850342,
"rewards/margins": 0.41662105917930603,
"rewards/rejected": -2.4887642860412598,
"step": 3280
},
{
"epoch": 2.3703170028818445,
"grad_norm": 22.082670294571745,
"learning_rate": 6.417172313108471e-09,
"logits/chosen": -1.9587681293487549,
"logits/rejected": -1.9533681869506836,
"logps/chosen": -0.9850085973739624,
"logps/rejected": -1.1136410236358643,
"loss": 1.1988,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.9700171947479248,
"rewards/margins": 0.2572648227214813,
"rewards/rejected": -2.2272820472717285,
"step": 3290
},
{
"epoch": 2.377521613832853,
"grad_norm": 21.566072441690903,
"learning_rate": 6.277611698421179e-09,
"logits/chosen": -2.0187790393829346,
"logits/rejected": -2.010676383972168,
"logps/chosen": -0.9041236042976379,
"logps/rejected": -1.0975037813186646,
"loss": 1.1246,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.8082472085952759,
"rewards/margins": 0.3867604732513428,
"rewards/rejected": -2.195007562637329,
"step": 3300
},
{
"epoch": 2.3847262247838614,
"grad_norm": 22.686014286093737,
"learning_rate": 6.139367190322714e-09,
"logits/chosen": -2.0076019763946533,
"logits/rejected": -2.0073728561401367,
"logps/chosen": -1.0593101978302002,
"logps/rejected": -1.2181084156036377,
"loss": 1.1614,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.1186203956604004,
"rewards/margins": 0.3175966143608093,
"rewards/rejected": -2.4362168312072754,
"step": 3310
},
{
"epoch": 2.39193083573487,
"grad_norm": 17.411733324996547,
"learning_rate": 6.002448506831171e-09,
"logits/chosen": -2.00325083732605,
"logits/rejected": -1.9984287023544312,
"logps/chosen": -0.9814065098762512,
"logps/rejected": -1.124678134918213,
"loss": 1.1735,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.9628130197525024,
"rewards/margins": 0.2865433394908905,
"rewards/rejected": -2.249356269836426,
"step": 3320
},
{
"epoch": 2.3991354466858787,
"grad_norm": 18.206116423091125,
"learning_rate": 5.866865272764607e-09,
"logits/chosen": -2.0245230197906494,
"logits/rejected": -2.02435040473938,
"logps/chosen": -1.016035795211792,
"logps/rejected": -1.1609398126602173,
"loss": 1.1763,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.032071590423584,
"rewards/margins": 0.2898081839084625,
"rewards/rejected": -2.3218796253204346,
"step": 3330
},
{
"epoch": 2.4063400576368874,
"grad_norm": 23.272555194691343,
"learning_rate": 5.7326270190645595e-09,
"logits/chosen": -1.900092363357544,
"logits/rejected": -1.901489496231079,
"logps/chosen": -1.0590754747390747,
"logps/rejected": -1.1687304973602295,
"loss": 1.2178,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.1181509494781494,
"rewards/margins": 0.2193101942539215,
"rewards/rejected": -2.337460994720459,
"step": 3340
},
{
"epoch": 2.413544668587896,
"grad_norm": 18.379780103310328,
"learning_rate": 5.599743182125938e-09,
"logits/chosen": -2.0489888191223145,
"logits/rejected": -2.048907518386841,
"logps/chosen": -1.04793381690979,
"logps/rejected": -1.1847532987594604,
"loss": 1.1791,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.09586763381958,
"rewards/margins": 0.2736392021179199,
"rewards/rejected": -2.369506597518921,
"step": 3350
},
{
"epoch": 2.4207492795389047,
"grad_norm": 20.327364402710476,
"learning_rate": 5.46822310313379e-09,
"logits/chosen": -2.0488951206207275,
"logits/rejected": -2.05851411819458,
"logps/chosen": -1.0903593301773071,
"logps/rejected": -1.1954041719436646,
"loss": 1.2347,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.1807186603546143,
"rewards/margins": 0.21008984744548798,
"rewards/rejected": -2.390808343887329,
"step": 3360
},
{
"epoch": 2.4279538904899134,
"grad_norm": 20.74526034057685,
"learning_rate": 5.33807602740658e-09,
"logits/chosen": -2.0205600261688232,
"logits/rejected": -2.0137457847595215,
"logps/chosen": -0.9561742544174194,
"logps/rejected": -1.1597832441329956,
"loss": 1.1117,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.9123485088348389,
"rewards/margins": 0.4072180390357971,
"rewards/rejected": -2.319566488265991,
"step": 3370
},
{
"epoch": 2.435158501440922,
"grad_norm": 21.143223128559057,
"learning_rate": 5.209311103746334e-09,
"logits/chosen": -2.000640869140625,
"logits/rejected": -2.0010857582092285,
"logps/chosen": -1.0521432161331177,
"logps/rejected": -1.224646806716919,
"loss": 1.1583,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.1042864322662354,
"rewards/margins": 0.34500715136528015,
"rewards/rejected": -2.449293613433838,
"step": 3380
},
{
"epoch": 2.4423631123919307,
"grad_norm": 24.24368097300932,
"learning_rate": 5.081937383795484e-09,
"logits/chosen": -1.9737918376922607,
"logits/rejected": -1.9732027053833008,
"logps/chosen": -0.9712947010993958,
"logps/rejected": -1.1367398500442505,
"loss": 1.1475,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.9425894021987915,
"rewards/margins": 0.3308902680873871,
"rewards/rejected": -2.273479700088501,
"step": 3390
},
{
"epoch": 2.4495677233429394,
"grad_norm": 18.631728815748932,
"learning_rate": 4.955963821400599e-09,
"logits/chosen": -2.028813123703003,
"logits/rejected": -2.0230822563171387,
"logps/chosen": -1.0294839143753052,
"logps/rejected": -1.166856288909912,
"loss": 1.1931,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.0589678287506104,
"rewards/margins": 0.2747448980808258,
"rewards/rejected": -2.333712577819824,
"step": 3400
},
{
"epoch": 2.456772334293948,
"grad_norm": 15.405415975081304,
"learning_rate": 4.831399271982928e-09,
"logits/chosen": -1.9567426443099976,
"logits/rejected": -1.9487731456756592,
"logps/chosen": -1.0431255102157593,
"logps/rejected": -1.1727097034454346,
"loss": 1.211,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0862510204315186,
"rewards/margins": 0.2591683268547058,
"rewards/rejected": -2.345419406890869,
"step": 3410
},
{
"epoch": 2.4639769452449567,
"grad_norm": 24.975880962434086,
"learning_rate": 4.708252491915951e-09,
"logits/chosen": -2.0264954566955566,
"logits/rejected": -2.0203347206115723,
"logps/chosen": -1.0453736782073975,
"logps/rejected": -1.1925294399261475,
"loss": 1.1967,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.090747356414795,
"rewards/margins": 0.29431161284446716,
"rewards/rejected": -2.385058879852295,
"step": 3420
},
{
"epoch": 2.4711815561959654,
"grad_norm": 25.25311706276655,
"learning_rate": 4.58653213790981e-09,
"logits/chosen": -2.009765863418579,
"logits/rejected": -2.0017480850219727,
"logps/chosen": -1.0253998041152954,
"logps/rejected": -1.1738344430923462,
"loss": 1.1794,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.050799608230591,
"rewards/margins": 0.2968693673610687,
"rewards/rejected": -2.3476688861846924,
"step": 3430
},
{
"epoch": 2.478386167146974,
"grad_norm": 18.143302706074053,
"learning_rate": 4.466246766402773e-09,
"logits/chosen": -1.989457130432129,
"logits/rejected": -1.9831485748291016,
"logps/chosen": -1.039151906967163,
"logps/rejected": -1.1928648948669434,
"loss": 1.1832,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.078303813934326,
"rewards/margins": 0.307425856590271,
"rewards/rejected": -2.3857297897338867,
"step": 3440
},
{
"epoch": 2.4855907780979827,
"grad_norm": 22.18491810055598,
"learning_rate": 4.347404832959775e-09,
"logits/chosen": -2.036830425262451,
"logits/rejected": -2.0370731353759766,
"logps/chosen": -1.0329066514968872,
"logps/rejected": -1.192126989364624,
"loss": 1.1623,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.0658133029937744,
"rewards/margins": 0.31844058632850647,
"rewards/rejected": -2.384253978729248,
"step": 3450
},
{
"epoch": 2.4927953890489913,
"grad_norm": 33.05603885847282,
"learning_rate": 4.230014691678016e-09,
"logits/chosen": -1.9939508438110352,
"logits/rejected": -1.9945671558380127,
"logps/chosen": -1.0595440864562988,
"logps/rejected": -1.1260967254638672,
"loss": 1.2725,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.1190881729125977,
"rewards/margins": 0.1331052929162979,
"rewards/rejected": -2.2521934509277344,
"step": 3460
},
{
"epoch": 2.5,
"grad_norm": 17.95056296756793,
"learning_rate": 4.114084594599707e-09,
"logits/chosen": -1.9955661296844482,
"logits/rejected": -1.9954122304916382,
"logps/chosen": -1.0110971927642822,
"logps/rejected": -1.2275745868682861,
"loss": 1.1027,
"rewards/accuracies": 0.65625,
"rewards/chosen": -2.0221943855285645,
"rewards/margins": 0.4329546391963959,
"rewards/rejected": -2.4551491737365723,
"step": 3470
},
{
"epoch": 2.5072046109510087,
"grad_norm": 22.46057751431382,
"learning_rate": 3.9996226911319546e-09,
"logits/chosen": -1.9920648336410522,
"logits/rejected": -1.97979736328125,
"logps/chosen": -1.015860915184021,
"logps/rejected": -1.1448417901992798,
"loss": 1.1908,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.031721830368042,
"rewards/margins": 0.25796186923980713,
"rewards/rejected": -2.2896835803985596,
"step": 3480
},
{
"epoch": 2.5144092219020173,
"grad_norm": 17.84685371740395,
"learning_rate": 3.886637027473949e-09,
"logits/chosen": -2.003864049911499,
"logits/rejected": -2.0060501098632812,
"logps/chosen": -1.0762816667556763,
"logps/rejected": -1.239137887954712,
"loss": 1.1561,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.1525633335113525,
"rewards/margins": 0.32571229338645935,
"rewards/rejected": -2.478275775909424,
"step": 3490
},
{
"epoch": 2.521613832853026,
"grad_norm": 19.29855967109395,
"learning_rate": 3.775135546051295e-09,
"logits/chosen": -1.9411065578460693,
"logits/rejected": -1.9420562982559204,
"logps/chosen": -1.0252829790115356,
"logps/rejected": -1.1507127285003662,
"loss": 1.1987,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0505659580230713,
"rewards/margins": 0.250859797000885,
"rewards/rejected": -2.3014254570007324,
"step": 3500
},
{
"epoch": 2.5288184438040346,
"grad_norm": 23.2883981469754,
"learning_rate": 3.665126084957723e-09,
"logits/chosen": -1.9881235361099243,
"logits/rejected": -1.9923149347305298,
"logps/chosen": -1.1336690187454224,
"logps/rejected": -1.2319626808166504,
"loss": 1.2595,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.2673380374908447,
"rewards/margins": 0.19658716022968292,
"rewards/rejected": -2.463925361633301,
"step": 3510
},
{
"epoch": 2.5360230547550433,
"grad_norm": 19.49687492971959,
"learning_rate": 3.556616377404101e-09,
"logits/chosen": -2.005202531814575,
"logits/rejected": -2.002882719039917,
"logps/chosen": -1.078424096107483,
"logps/rejected": -1.2361047267913818,
"loss": 1.1542,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.156848192214966,
"rewards/margins": 0.3153611719608307,
"rewards/rejected": -2.4722094535827637,
"step": 3520
},
{
"epoch": 2.543227665706052,
"grad_norm": 19.79099303892538,
"learning_rate": 3.4496140511748125e-09,
"logits/chosen": -1.9998537302017212,
"logits/rejected": -1.9945876598358154,
"logps/chosen": -1.054868459701538,
"logps/rejected": -1.198381781578064,
"loss": 1.176,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.109736919403076,
"rewards/margins": 0.2870263457298279,
"rewards/rejected": -2.396763563156128,
"step": 3530
},
{
"epoch": 2.5504322766570606,
"grad_norm": 31.21393810269857,
"learning_rate": 3.3441266280915427e-09,
"logits/chosen": -1.9868720769882202,
"logits/rejected": -1.9875354766845703,
"logps/chosen": -1.0944864749908447,
"logps/rejected": -1.2088205814361572,
"loss": 1.2139,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.1889729499816895,
"rewards/margins": 0.2286679744720459,
"rewards/rejected": -2.4176411628723145,
"step": 3540
},
{
"epoch": 2.5576368876080693,
"grad_norm": 23.58461815941298,
"learning_rate": 3.2401615234845693e-09,
"logits/chosen": -2.00813627243042,
"logits/rejected": -2.0022683143615723,
"logps/chosen": -1.092397928237915,
"logps/rejected": -1.235439658164978,
"loss": 1.1899,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.18479585647583,
"rewards/margins": 0.28608375787734985,
"rewards/rejected": -2.470879316329956,
"step": 3550
},
{
"epoch": 2.564841498559078,
"grad_norm": 16.081046688546003,
"learning_rate": 3.1377260456714375e-09,
"logits/chosen": -1.8945989608764648,
"logits/rejected": -1.8861172199249268,
"logps/chosen": -1.060025930404663,
"logps/rejected": -1.2014561891555786,
"loss": 1.1701,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.120051860809326,
"rewards/margins": 0.2828606069087982,
"rewards/rejected": -2.4029123783111572,
"step": 3560
},
{
"epoch": 2.5720461095100866,
"grad_norm": 17.90788569435218,
"learning_rate": 3.0368273954432698e-09,
"logits/chosen": -2.0311574935913086,
"logits/rejected": -2.022732973098755,
"logps/chosen": -1.0490517616271973,
"logps/rejected": -1.1529314517974854,
"loss": 1.2253,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.0981035232543945,
"rewards/margins": 0.20775911211967468,
"rewards/rejected": -2.3058629035949707,
"step": 3570
},
{
"epoch": 2.5792507204610953,
"grad_norm": 17.350027926948087,
"learning_rate": 2.937472665558541e-09,
"logits/chosen": -2.0183825492858887,
"logits/rejected": -2.019484043121338,
"logps/chosen": -1.0362021923065186,
"logps/rejected": -1.1471149921417236,
"loss": 1.2278,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.072404384613037,
"rewards/margins": 0.22182568907737732,
"rewards/rejected": -2.2942299842834473,
"step": 3580
},
{
"epoch": 2.586455331412104,
"grad_norm": 21.819876934855408,
"learning_rate": 2.8396688402445053e-09,
"logits/chosen": -2.0643913745880127,
"logits/rejected": -2.0568366050720215,
"logps/chosen": -1.0093636512756348,
"logps/rejected": -1.2179429531097412,
"loss": 1.1053,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.0187273025512695,
"rewards/margins": 0.41715869307518005,
"rewards/rejected": -2.4358859062194824,
"step": 3590
},
{
"epoch": 2.5936599423631126,
"grad_norm": 23.978636486056804,
"learning_rate": 2.7434227947062324e-09,
"logits/chosen": -2.0086283683776855,
"logits/rejected": -2.002335548400879,
"logps/chosen": -1.1310678720474243,
"logps/rejected": -1.239127516746521,
"loss": 1.2324,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.2621357440948486,
"rewards/margins": 0.2161194384098053,
"rewards/rejected": -2.478255033493042,
"step": 3600
},
{
"epoch": 2.6008645533141213,
"grad_norm": 18.242440029630476,
"learning_rate": 2.6487412946432976e-09,
"logits/chosen": -1.9712591171264648,
"logits/rejected": -1.966138482093811,
"logps/chosen": -1.0688244104385376,
"logps/rejected": -1.2041301727294922,
"loss": 1.1924,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.137648820877075,
"rewards/margins": 0.27061182260513306,
"rewards/rejected": -2.4082603454589844,
"step": 3610
},
{
"epoch": 2.60806916426513,
"grad_norm": 22.81379926161724,
"learning_rate": 2.5556309957742024e-09,
"logits/chosen": -1.9811557531356812,
"logits/rejected": -1.9759891033172607,
"logps/chosen": -1.0247745513916016,
"logps/rejected": -1.220990538597107,
"loss": 1.1157,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.049549102783203,
"rewards/margins": 0.39243215322494507,
"rewards/rejected": -2.441981077194214,
"step": 3620
},
{
"epoch": 2.6152737752161386,
"grad_norm": 22.699037535274126,
"learning_rate": 2.4640984433684758e-09,
"logits/chosen": -2.03954815864563,
"logits/rejected": -2.040444850921631,
"logps/chosen": -1.1181697845458984,
"logps/rejected": -1.234116554260254,
"loss": 1.2351,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.236339569091797,
"rewards/margins": 0.231893390417099,
"rewards/rejected": -2.468233108520508,
"step": 3630
},
{
"epoch": 2.6224783861671472,
"grad_norm": 16.94171934712139,
"learning_rate": 2.3741500717865987e-09,
"logits/chosen": -1.995910882949829,
"logits/rejected": -2.00685715675354,
"logps/chosen": -1.0068349838256836,
"logps/rejected": -1.1508421897888184,
"loss": 1.1796,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.013669967651367,
"rewards/margins": 0.2880145311355591,
"rewards/rejected": -2.3016843795776367,
"step": 3640
},
{
"epoch": 2.629682997118156,
"grad_norm": 17.359736896500603,
"learning_rate": 2.285792204027678e-09,
"logits/chosen": -1.9837639331817627,
"logits/rejected": -1.9810622930526733,
"logps/chosen": -1.0128896236419678,
"logps/rejected": -1.2106821537017822,
"loss": 1.1023,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.0257792472839355,
"rewards/margins": 0.3955853283405304,
"rewards/rejected": -2.4213643074035645,
"step": 3650
},
{
"epoch": 2.636887608069164,
"grad_norm": 20.82292935112201,
"learning_rate": 2.199031051284972e-09,
"logits/chosen": -2.008237838745117,
"logits/rejected": -2.003821611404419,
"logps/chosen": -1.0695806741714478,
"logps/rejected": -1.1950201988220215,
"loss": 1.2201,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.1391613483428955,
"rewards/margins": 0.25087904930114746,
"rewards/rejected": -2.390040397644043,
"step": 3660
},
{
"epoch": 2.6440922190201728,
"grad_norm": 16.889253562953712,
"learning_rate": 2.113872712509254e-09,
"logits/chosen": -1.993787407875061,
"logits/rejected": -1.9862686395645142,
"logps/chosen": -1.1294848918914795,
"logps/rejected": -1.2411291599273682,
"loss": 1.2278,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.258969783782959,
"rewards/margins": 0.22328904271125793,
"rewards/rejected": -2.4822583198547363,
"step": 3670
},
{
"epoch": 2.6512968299711814,
"grad_norm": 14.064327131140953,
"learning_rate": 2.0303231739801143e-09,
"logits/chosen": -1.9686027765274048,
"logits/rejected": -1.957910180091858,
"logps/chosen": -1.0182335376739502,
"logps/rejected": -1.1589828729629517,
"loss": 1.182,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.0364670753479004,
"rewards/margins": 0.2814987301826477,
"rewards/rejected": -2.3179657459259033,
"step": 3680
},
{
"epoch": 2.65850144092219,
"grad_norm": 23.602606022846373,
"learning_rate": 1.948388308885102e-09,
"logits/chosen": -2.0372962951660156,
"logits/rejected": -2.0287270545959473,
"logps/chosen": -1.063071846961975,
"logps/rejected": -1.1734139919281006,
"loss": 1.2172,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.12614369392395,
"rewards/margins": 0.22068460285663605,
"rewards/rejected": -2.346827983856201,
"step": 3690
},
{
"epoch": 2.6657060518731988,
"grad_norm": 25.359683354788714,
"learning_rate": 1.86807387690692e-09,
"logits/chosen": -2.0645687580108643,
"logits/rejected": -2.061300754547119,
"logps/chosen": -1.0886929035186768,
"logps/rejected": -1.2760602235794067,
"loss": 1.1167,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.1773858070373535,
"rewards/margins": 0.37473443150520325,
"rewards/rejected": -2.5521204471588135,
"step": 3700
},
{
"epoch": 2.6729106628242074,
"grad_norm": 19.291599291445024,
"learning_rate": 1.789385523818493e-09,
"logits/chosen": -2.024766206741333,
"logits/rejected": -2.0262362957000732,
"logps/chosen": -1.0400424003601074,
"logps/rejected": -1.2077550888061523,
"loss": 1.1498,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.080084800720215,
"rewards/margins": 0.33542555570602417,
"rewards/rejected": -2.4155101776123047,
"step": 3710
},
{
"epoch": 2.680115273775216,
"grad_norm": 25.661768967793073,
"learning_rate": 1.712328781086131e-09,
"logits/chosen": -2.051741123199463,
"logits/rejected": -2.046497344970703,
"logps/chosen": -1.1223233938217163,
"logps/rejected": -1.2178256511688232,
"loss": 1.2398,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -2.2446467876434326,
"rewards/margins": 0.19100406765937805,
"rewards/rejected": -2.4356513023376465,
"step": 3720
},
{
"epoch": 2.6873198847262247,
"grad_norm": 21.329919320711415,
"learning_rate": 1.6369090654806543e-09,
"logits/chosen": -2.0555293560028076,
"logits/rejected": -2.0489516258239746,
"logps/chosen": -1.0201804637908936,
"logps/rejected": -1.163674235343933,
"loss": 1.1691,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.040360927581787,
"rewards/margins": 0.28698763251304626,
"rewards/rejected": -2.327348470687866,
"step": 3730
},
{
"epoch": 2.6945244956772334,
"grad_norm": 19.141258585549746,
"learning_rate": 1.5631316786966498e-09,
"logits/chosen": -1.9855105876922607,
"logits/rejected": -1.978864312171936,
"logps/chosen": -1.0213624238967896,
"logps/rejected": -1.1611801385879517,
"loss": 1.1975,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -2.042724847793579,
"rewards/margins": 0.27963531017303467,
"rewards/rejected": -2.3223602771759033,
"step": 3740
},
{
"epoch": 2.701729106628242,
"grad_norm": 18.560557441799045,
"learning_rate": 1.491001806979772e-09,
"logits/chosen": -2.0349512100219727,
"logits/rejected": -2.028040647506714,
"logps/chosen": -1.0765492916107178,
"logps/rejected": -1.225642442703247,
"loss": 1.1737,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.1530985832214355,
"rewards/margins": 0.2981860637664795,
"rewards/rejected": -2.451284885406494,
"step": 3750
},
{
"epoch": 2.7089337175792507,
"grad_norm": 29.217027349904072,
"learning_rate": 1.4205245207621508e-09,
"logits/chosen": -1.9789804220199585,
"logits/rejected": -1.9764864444732666,
"logps/chosen": -1.1173272132873535,
"logps/rejected": -1.2862274646759033,
"loss": 1.1533,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.234654426574707,
"rewards/margins": 0.337800532579422,
"rewards/rejected": -2.5724549293518066,
"step": 3760
},
{
"epoch": 2.7161383285302594,
"grad_norm": 17.685903552737507,
"learning_rate": 1.3517047743059978e-09,
"logits/chosen": -2.0163912773132324,
"logits/rejected": -2.0196452140808105,
"logps/chosen": -1.073099136352539,
"logps/rejected": -1.2338473796844482,
"loss": 1.1562,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.146198272705078,
"rewards/margins": 0.3214961588382721,
"rewards/rejected": -2.4676947593688965,
"step": 3770
},
{
"epoch": 2.723342939481268,
"grad_norm": 17.086412835946355,
"learning_rate": 1.2845474053553156e-09,
"logits/chosen": -2.011781692504883,
"logits/rejected": -2.007628917694092,
"logps/chosen": -1.0312448740005493,
"logps/rejected": -1.1682651042938232,
"loss": 1.2025,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -2.0624897480010986,
"rewards/margins": 0.27404046058654785,
"rewards/rejected": -2.3365302085876465,
"step": 3780
},
{
"epoch": 2.7305475504322767,
"grad_norm": 22.568118558934447,
"learning_rate": 1.2190571347958422e-09,
"logits/chosen": -2.0422775745391846,
"logits/rejected": -2.0436275005340576,
"logps/chosen": -0.9664519429206848,
"logps/rejected": -1.167764663696289,
"loss": 1.1102,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.9329038858413696,
"rewards/margins": 0.40262526273727417,
"rewards/rejected": -2.335529327392578,
"step": 3790
},
{
"epoch": 2.7377521613832854,
"grad_norm": 18.280458594650423,
"learning_rate": 1.1552385663231634e-09,
"logits/chosen": -1.9983785152435303,
"logits/rejected": -1.9888427257537842,
"logps/chosen": -1.0933729410171509,
"logps/rejected": -1.1888386011123657,
"loss": 1.2396,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -2.1867458820343018,
"rewards/margins": 0.19093120098114014,
"rewards/rejected": -2.3776772022247314,
"step": 3800
},
{
"epoch": 2.744956772334294,
"grad_norm": 19.071247762838464,
"learning_rate": 1.0930961861191302e-09,
"logits/chosen": -1.9584522247314453,
"logits/rejected": -1.9630699157714844,
"logps/chosen": -1.0375924110412598,
"logps/rejected": -1.1800395250320435,
"loss": 1.2,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.0751848220825195,
"rewards/margins": 0.28489404916763306,
"rewards/rejected": -2.360079050064087,
"step": 3810
},
{
"epoch": 2.7521613832853027,
"grad_norm": 16.698045888323442,
"learning_rate": 1.0326343625364608e-09,
"logits/chosen": -1.9668670892715454,
"logits/rejected": -1.9615176916122437,
"logps/chosen": -1.040906310081482,
"logps/rejected": -1.2131140232086182,
"loss": 1.1383,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.081812620162964,
"rewards/margins": 0.3444153070449829,
"rewards/rejected": -2.4262280464172363,
"step": 3820
},
{
"epoch": 2.7593659942363113,
"grad_norm": 18.443004885087902,
"learning_rate": 9.738573457917066e-10,
"logits/chosen": -2.0455007553100586,
"logits/rejected": -2.0438742637634277,
"logps/chosen": -1.0494908094406128,
"logps/rejected": -1.2397874593734741,
"loss": 1.1107,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.0989816188812256,
"rewards/margins": 0.3805932402610779,
"rewards/rejected": -2.4795749187469482,
"step": 3830
},
{
"epoch": 2.76657060518732,
"grad_norm": 18.699877901633396,
"learning_rate": 9.16769267666434e-10,
"logits/chosen": -2.012563705444336,
"logits/rejected": -2.0103211402893066,
"logps/chosen": -1.0743488073349,
"logps/rejected": -1.1493780612945557,
"loss": 1.2646,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -2.1486976146698,
"rewards/margins": 0.1500580608844757,
"rewards/rejected": -2.2987561225891113,
"step": 3840
},
{
"epoch": 2.7737752161383287,
"grad_norm": 19.950649690108477,
"learning_rate": 8.613741412168113e-10,
"logits/chosen": -2.024034261703491,
"logits/rejected": -2.023303270339966,
"logps/chosen": -1.0807751417160034,
"logps/rejected": -1.209153413772583,
"loss": 1.1806,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.161550283432007,
"rewards/margins": 0.2567565441131592,
"rewards/rejected": -2.418306827545166,
"step": 3850
},
{
"epoch": 2.7809798270893373,
"grad_norm": 19.864173044191876,
"learning_rate": 8.076758604914802e-10,
"logits/chosen": -1.9579622745513916,
"logits/rejected": -1.9533469676971436,
"logps/chosen": -0.9816028475761414,
"logps/rejected": -1.1145174503326416,
"loss": 1.1989,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9632056951522827,
"rewards/margins": 0.2658289074897766,
"rewards/rejected": -2.229034900665283,
"step": 3860
},
{
"epoch": 2.7881844380403455,
"grad_norm": 22.85284631990654,
"learning_rate": 7.55678200257856e-10,
"logits/chosen": -1.9850330352783203,
"logits/rejected": -1.9783369302749634,
"logps/chosen": -1.032204031944275,
"logps/rejected": -1.1751976013183594,
"loss": 1.1758,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.06440806388855,
"rewards/margins": 0.28598710894584656,
"rewards/rejected": -2.3503952026367188,
"step": 3870
},
{
"epoch": 2.795389048991354,
"grad_norm": 17.089433378243903,
"learning_rate": 7.053848157367315e-10,
"logits/chosen": -2.0007100105285645,
"logits/rejected": -1.9952503442764282,
"logps/chosen": -1.0419762134552002,
"logps/rejected": -1.1898977756500244,
"loss": 1.1845,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.0839524269104004,
"rewards/margins": 0.2958431541919708,
"rewards/rejected": -2.379795551300049,
"step": 3880
},
{
"epoch": 2.802593659942363,
"grad_norm": 15.839145055220296,
"learning_rate": 6.567992423453794e-10,
"logits/chosen": -2.015761375427246,
"logits/rejected": -2.0144906044006348,
"logps/chosen": -0.9628134965896606,
"logps/rejected": -1.0785901546478271,
"loss": 1.2027,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.9256269931793213,
"rewards/margins": 0.23155340552330017,
"rewards/rejected": -2.1571803092956543,
"step": 3890
},
{
"epoch": 2.8097982708933715,
"grad_norm": 19.561769738169332,
"learning_rate": 6.099248954489794e-10,
"logits/chosen": -1.9572585821151733,
"logits/rejected": -1.9550421237945557,
"logps/chosen": -1.0681164264678955,
"logps/rejected": -1.228930115699768,
"loss": 1.157,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.136232852935791,
"rewards/margins": 0.321627140045166,
"rewards/rejected": -2.457860231399536,
"step": 3900
},
{
"epoch": 2.81700288184438,
"grad_norm": 22.777244684824645,
"learning_rate": 5.647650701205653e-10,
"logits/chosen": -2.024953842163086,
"logits/rejected": -2.016838550567627,
"logps/chosen": -1.1103287935256958,
"logps/rejected": -1.2667860984802246,
"loss": 1.1763,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -2.2206575870513916,
"rewards/margins": 0.31291496753692627,
"rewards/rejected": -2.533572196960449,
"step": 3910
},
{
"epoch": 2.824207492795389,
"grad_norm": 16.303046104621277,
"learning_rate": 5.213229409093856e-10,
"logits/chosen": -2.0344924926757812,
"logits/rejected": -2.0291943550109863,
"logps/chosen": -1.0526678562164307,
"logps/rejected": -1.1854225397109985,
"loss": 1.1995,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -2.1053357124328613,
"rewards/margins": 0.2655092179775238,
"rewards/rejected": -2.370845079421997,
"step": 3920
},
{
"epoch": 2.8314121037463975,
"grad_norm": 20.917034597157283,
"learning_rate": 4.796015616177401e-10,
"logits/chosen": -1.9968492984771729,
"logits/rejected": -1.9910831451416016,
"logps/chosen": -1.0663249492645264,
"logps/rejected": -1.1775870323181152,
"loss": 1.2153,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.1326498985290527,
"rewards/margins": 0.22252389788627625,
"rewards/rejected": -2.3551740646362305,
"step": 3930
},
{
"epoch": 2.838616714697406,
"grad_norm": 16.746348704994205,
"learning_rate": 4.3960386508631595e-10,
"logits/chosen": -1.9347660541534424,
"logits/rejected": -1.9273744821548462,
"logps/chosen": -0.9666959643363953,
"logps/rejected": -1.0863577127456665,
"loss": 1.2257,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9333919286727905,
"rewards/margins": 0.23932373523712158,
"rewards/rejected": -2.172715425491333,
"step": 3940
},
{
"epoch": 2.845821325648415,
"grad_norm": 35.59868847583538,
"learning_rate": 4.013326629880243e-10,
"logits/chosen": -1.9773681163787842,
"logits/rejected": -1.9677212238311768,
"logps/chosen": -1.1061880588531494,
"logps/rejected": -1.2337268590927124,
"loss": 1.2042,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.212376117706299,
"rewards/margins": 0.2550778090953827,
"rewards/rejected": -2.467453718185425,
"step": 3950
},
{
"epoch": 2.8530259365994235,
"grad_norm": 19.68089141096437,
"learning_rate": 3.64790645630339e-10,
"logits/chosen": -1.9358896017074585,
"logits/rejected": -1.9352163076400757,
"logps/chosen": -1.0549217462539673,
"logps/rejected": -1.1247824430465698,
"loss": 1.2634,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.1098434925079346,
"rewards/margins": 0.13972175121307373,
"rewards/rejected": -2.2495648860931396,
"step": 3960
},
{
"epoch": 2.860230547550432,
"grad_norm": 21.305782765629946,
"learning_rate": 3.2998038176619e-10,
"logits/chosen": -1.9779675006866455,
"logits/rejected": -1.9694792032241821,
"logps/chosen": -1.0569480657577515,
"logps/rejected": -1.1792137622833252,
"loss": 1.2079,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -2.113896131515503,
"rewards/margins": 0.24453163146972656,
"rewards/rejected": -2.3584275245666504,
"step": 3970
},
{
"epoch": 2.867435158501441,
"grad_norm": 20.77606365899596,
"learning_rate": 2.969043184133907e-10,
"logits/chosen": -2.0462427139282227,
"logits/rejected": -2.0448436737060547,
"logps/chosen": -0.9707571268081665,
"logps/rejected": -1.1868783235549927,
"loss": 1.0771,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.941514253616333,
"rewards/margins": 0.4322422444820404,
"rewards/rejected": -2.3737566471099854,
"step": 3980
},
{
"epoch": 2.8746397694524495,
"grad_norm": 17.888841507835842,
"learning_rate": 2.6556478068261447e-10,
"logits/chosen": -1.9706792831420898,
"logits/rejected": -1.968033790588379,
"logps/chosen": -0.9736353158950806,
"logps/rejected": -1.1013442277908325,
"loss": 1.2103,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -1.9472706317901611,
"rewards/margins": 0.25541773438453674,
"rewards/rejected": -2.202688455581665,
"step": 3990
},
{
"epoch": 2.881844380403458,
"grad_norm": 20.827982084787728,
"learning_rate": 2.3596397161395607e-10,
"logits/chosen": -2.0459811687469482,
"logits/rejected": -2.0342445373535156,
"logps/chosen": -1.0675297975540161,
"logps/rejected": -1.2322208881378174,
"loss": 1.1585,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.1350595951080322,
"rewards/margins": 0.32938265800476074,
"rewards/rejected": -2.4644417762756348,
"step": 4000
},
{
"epoch": 2.889048991354467,
"grad_norm": 26.363176469996105,
"learning_rate": 2.0810397202206399e-10,
"logits/chosen": -1.9519503116607666,
"logits/rejected": -1.9571945667266846,
"logps/chosen": -1.0638792514801025,
"logps/rejected": -1.1936867237091064,
"loss": 1.1899,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -2.127758502960205,
"rewards/margins": 0.25961530208587646,
"rewards/rejected": -2.387373447418213,
"step": 4010
},
{
"epoch": 2.8962536023054755,
"grad_norm": 22.435915119282647,
"learning_rate": 1.819867403498737e-10,
"logits/chosen": -2.0360004901885986,
"logits/rejected": -2.0333070755004883,
"logps/chosen": -1.0682737827301025,
"logps/rejected": -1.1997547149658203,
"loss": 1.2021,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.136547565460205,
"rewards/margins": 0.262962281703949,
"rewards/rejected": -2.3995094299316406,
"step": 4020
},
{
"epoch": 2.903458213256484,
"grad_norm": 21.73593175736094,
"learning_rate": 1.5761411253092382e-10,
"logits/chosen": -1.9650490283966064,
"logits/rejected": -1.9548593759536743,
"logps/chosen": -0.9868356585502625,
"logps/rejected": -1.1082721948623657,
"loss": 1.2004,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.973671317100525,
"rewards/margins": 0.24287304282188416,
"rewards/rejected": -2.2165443897247314,
"step": 4030
},
{
"epoch": 2.910662824207493,
"grad_norm": 20.195616773045355,
"learning_rate": 1.3498780186031455e-10,
"logits/chosen": -2.0080840587615967,
"logits/rejected": -2.0045909881591797,
"logps/chosen": -1.161108136177063,
"logps/rejected": -1.2804962396621704,
"loss": 1.2261,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -2.322216272354126,
"rewards/margins": 0.23877570033073425,
"rewards/rejected": -2.560992479324341,
"step": 4040
},
{
"epoch": 2.9178674351585014,
"grad_norm": 15.481345158642924,
"learning_rate": 1.1410939887425141e-10,
"logits/chosen": -2.0009922981262207,
"logits/rejected": -2.002737522125244,
"logps/chosen": -1.044710397720337,
"logps/rejected": -1.1732409000396729,
"loss": 1.2109,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.089420795440674,
"rewards/margins": 0.2570609450340271,
"rewards/rejected": -2.3464818000793457,
"step": 4050
},
{
"epoch": 2.92507204610951,
"grad_norm": 18.467501751315456,
"learning_rate": 9.498037123825686e-11,
"logits/chosen": -2.008939743041992,
"logits/rejected": -2.0055575370788574,
"logps/chosen": -1.0212924480438232,
"logps/rejected": -1.1463903188705444,
"loss": 1.1994,
"rewards/accuracies": 0.59375,
"rewards/chosen": -2.0425848960876465,
"rewards/margins": 0.2501956522464752,
"rewards/rejected": -2.292780637741089,
"step": 4060
},
{
"epoch": 2.9322766570605188,
"grad_norm": 21.297612250322334,
"learning_rate": 7.760206364398614e-11,
"logits/chosen": -2.0672459602355957,
"logits/rejected": -2.0643718242645264,
"logps/chosen": -1.0762133598327637,
"logps/rejected": -1.2181751728057861,
"loss": 1.1851,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.1524267196655273,
"rewards/margins": 0.283923476934433,
"rewards/rejected": -2.4363503456115723,
"step": 4070
},
{
"epoch": 2.9394812680115274,
"grad_norm": 21.284898715739565,
"learning_rate": 6.19756977147029e-11,
"logits/chosen": -1.9935872554779053,
"logits/rejected": -1.990228295326233,
"logps/chosen": -1.0278215408325195,
"logps/rejected": -1.2337336540222168,
"loss": 1.1105,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.055643081665039,
"rewards/margins": 0.4118243157863617,
"rewards/rejected": -2.4674673080444336,
"step": 4080
},
{
"epoch": 2.946685878962536,
"grad_norm": 20.87599043089686,
"learning_rate": 4.810237191940625e-11,
"logits/chosen": -1.976012945175171,
"logits/rejected": -1.97482168674469,
"logps/chosen": -1.038079857826233,
"logps/rejected": -1.1688039302825928,
"loss": 1.2176,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -2.076159715652466,
"rewards/margins": 0.2614482045173645,
"rewards/rejected": -2.3376078605651855,
"step": 4090
},
{
"epoch": 2.9538904899135447,
"grad_norm": 20.210725111790943,
"learning_rate": 3.5983061495617476e-11,
"logits/chosen": -2.032045841217041,
"logits/rejected": -2.0321781635284424,
"logps/chosen": -1.1231155395507812,
"logps/rejected": -1.270812749862671,
"loss": 1.1826,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.2462310791015625,
"rewards/margins": 0.295394629240036,
"rewards/rejected": -2.541625499725342,
"step": 4100
},
{
"epoch": 2.9610951008645534,
"grad_norm": 21.355044904847375,
"learning_rate": 2.5618618380812694e-11,
"logits/chosen": -2.018833637237549,
"logits/rejected": -2.008314371109009,
"logps/chosen": -1.0017220973968506,
"logps/rejected": -1.1643407344818115,
"loss": 1.1725,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -2.003444194793701,
"rewards/margins": 0.32523733377456665,
"rewards/rejected": -2.328681468963623,
"step": 4110
},
{
"epoch": 2.968299711815562,
"grad_norm": 22.742418454002138,
"learning_rate": 1.700977115254576e-11,
"logits/chosen": -1.9938160181045532,
"logits/rejected": -1.9904816150665283,
"logps/chosen": -0.9965242147445679,
"logps/rejected": -1.1439779996871948,
"loss": 1.1688,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.9930484294891357,
"rewards/margins": 0.29490748047828674,
"rewards/rejected": -2.2879559993743896,
"step": 4120
},
{
"epoch": 2.9755043227665707,
"grad_norm": 20.73702179826127,
"learning_rate": 1.0157124977230868e-11,
"logits/chosen": -1.9752233028411865,
"logits/rejected": -1.9736427068710327,
"logps/chosen": -0.9688889384269714,
"logps/rejected": -1.117297649383545,
"loss": 1.1686,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.9377778768539429,
"rewards/margins": 0.2968176603317261,
"rewards/rejected": -2.23459529876709,
"step": 4130
},
{
"epoch": 2.9827089337175794,
"grad_norm": 21.969537635189976,
"learning_rate": 5.061161567596061e-12,
"logits/chosen": -1.9961363077163696,
"logits/rejected": -1.9917857646942139,
"logps/chosen": -1.0561162233352661,
"logps/rejected": -1.1413224935531616,
"loss": 1.261,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -2.1122324466705322,
"rewards/margins": 0.17041274905204773,
"rewards/rejected": -2.2826449871063232,
"step": 4140
},
{
"epoch": 2.989913544668588,
"grad_norm": 20.739431552772885,
"learning_rate": 1.7222391488297406e-12,
"logits/chosen": -2.016247510910034,
"logits/rejected": -2.0124571323394775,
"logps/chosen": -1.1069406270980835,
"logps/rejected": -1.2540051937103271,
"loss": 1.1761,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.213881254196167,
"rewards/margins": 0.2941294014453888,
"rewards/rejected": -2.5080103874206543,
"step": 4150
},
{
"epoch": 2.9971181556195967,
"grad_norm": 19.721112564343052,
"learning_rate": 1.4059243338693238e-13,
"logits/chosen": -1.9906642436981201,
"logits/rejected": -1.9835201501846313,
"logps/chosen": -1.0591213703155518,
"logps/rejected": -1.182531714439392,
"loss": 1.1942,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.1182427406311035,
"rewards/margins": 0.24682076275348663,
"rewards/rejected": -2.365063428878784,
"step": 4160
},
{
"epoch": 3.0,
"step": 4164,
"total_flos": 0.0,
"train_loss": 1.2025406490027275,
"train_runtime": 5488.6376,
"train_samples_per_second": 12.135,
"train_steps_per_second": 0.759
}
],
"logging_steps": 10,
"max_steps": 4164,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}