Checkpoint3200 / trainer_state.json
DungND1107's picture
Upload 15 files
764474b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2809913726867605,
"eval_steps": 100,
"global_step": 3200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008780980396461265,
"grad_norm": 6.670812129974365,
"learning_rate": 1.0000000000000001e-07,
"logits/chosen": 4.2431488037109375,
"logits/rejected": 4.231738567352295,
"logps/chosen": -9.991304397583008,
"logps/rejected": -10.524327278137207,
"loss": 3.0309,
"nll_loss": 2.3641905784606934,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.99739146232605,
"rewards/margins": 0.15990665555000305,
"rewards/rejected": -3.1572983264923096,
"step": 10
},
{
"epoch": 0.001756196079292253,
"grad_norm": 10.682082176208496,
"learning_rate": 2.1111111111111113e-07,
"logits/chosen": 4.309535980224609,
"logits/rejected": 4.361606597900391,
"logps/chosen": -9.786101341247559,
"logps/rejected": -10.518722534179688,
"loss": 3.1598,
"nll_loss": 2.5160868167877197,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.9358303546905518,
"rewards/margins": 0.2197863757610321,
"rewards/rejected": -3.1556167602539062,
"step": 20
},
{
"epoch": 0.0026342941189383797,
"grad_norm": 15.04028034210205,
"learning_rate": 3.2222222222222227e-07,
"logits/chosen": 4.225083351135254,
"logits/rejected": 4.215059757232666,
"logps/chosen": -9.968446731567383,
"logps/rejected": -10.634344100952148,
"loss": 2.9674,
"nll_loss": 2.318664073944092,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.9905343055725098,
"rewards/margins": 0.19976934790611267,
"rewards/rejected": -3.1903038024902344,
"step": 30
},
{
"epoch": 0.003512392158584506,
"grad_norm": 10.793362617492676,
"learning_rate": 4.333333333333334e-07,
"logits/chosen": 4.328730583190918,
"logits/rejected": 4.321128845214844,
"logps/chosen": -9.762226104736328,
"logps/rejected": -10.319639205932617,
"loss": 2.7628,
"nll_loss": 2.110645294189453,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.928668260574341,
"rewards/margins": 0.1672237515449524,
"rewards/rejected": -3.0958914756774902,
"step": 40
},
{
"epoch": 0.004390490198230633,
"grad_norm": 6.863041400909424,
"learning_rate": 5.444444444444444e-07,
"logits/chosen": 4.535063743591309,
"logits/rejected": 4.555140495300293,
"logps/chosen": -10.173809051513672,
"logps/rejected": -10.436089515686035,
"loss": 2.7587,
"nll_loss": 2.0572845935821533,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -3.052142858505249,
"rewards/margins": 0.07868396490812302,
"rewards/rejected": -3.130826711654663,
"step": 50
},
{
"epoch": 0.005268588237876759,
"grad_norm": 11.63379955291748,
"learning_rate": 6.555555555555556e-07,
"logits/chosen": 4.210951805114746,
"logits/rejected": 4.193416595458984,
"logps/chosen": -9.710775375366211,
"logps/rejected": -10.588180541992188,
"loss": 3.023,
"nll_loss": 2.409853458404541,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.9132332801818848,
"rewards/margins": 0.2632210850715637,
"rewards/rejected": -3.1764540672302246,
"step": 60
},
{
"epoch": 0.006146686277522885,
"grad_norm": 4.952016830444336,
"learning_rate": 7.666666666666667e-07,
"logits/chosen": 4.56010627746582,
"logits/rejected": 4.571717739105225,
"logps/chosen": -9.788490295410156,
"logps/rejected": -10.214717864990234,
"loss": 2.6203,
"nll_loss": 1.9469432830810547,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.936546802520752,
"rewards/margins": 0.12786847352981567,
"rewards/rejected": -3.064415454864502,
"step": 70
},
{
"epoch": 0.007024784317169012,
"grad_norm": 10.809489250183105,
"learning_rate": 8.777777777777778e-07,
"logits/chosen": 4.2120256423950195,
"logits/rejected": 4.222228050231934,
"logps/chosen": -9.806170463562012,
"logps/rejected": -10.296957969665527,
"loss": 3.3042,
"nll_loss": 2.644357681274414,
"rewards/accuracies": 0.5625,
"rewards/chosen": -2.9418509006500244,
"rewards/margins": 0.1472366452217102,
"rewards/rejected": -3.08908748626709,
"step": 80
},
{
"epoch": 0.007902882356815138,
"grad_norm": 9.27618408203125,
"learning_rate": 9.88888888888889e-07,
"logits/chosen": 4.336479187011719,
"logits/rejected": 4.3232035636901855,
"logps/chosen": -9.828582763671875,
"logps/rejected": -10.485132217407227,
"loss": 3.1223,
"nll_loss": 2.4759140014648438,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -2.948575496673584,
"rewards/margins": 0.1969645768404007,
"rewards/rejected": -3.1455399990081787,
"step": 90
},
{
"epoch": 0.008780980396461266,
"grad_norm": 19.710798263549805,
"learning_rate": 1.1e-06,
"logits/chosen": 4.149864673614502,
"logits/rejected": 4.147267818450928,
"logps/chosen": -9.967988014221191,
"logps/rejected": -10.663251876831055,
"loss": 2.9038,
"nll_loss": 2.2608590126037598,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.990396499633789,
"rewards/margins": 0.20857906341552734,
"rewards/rejected": -3.1989755630493164,
"step": 100
},
{
"epoch": 0.009659078436107391,
"grad_norm": 10.258548736572266,
"learning_rate": 1.2111111111111111e-06,
"logits/chosen": 4.241273880004883,
"logits/rejected": 4.249630928039551,
"logps/chosen": -9.708308219909668,
"logps/rejected": -10.221686363220215,
"loss": 3.2698,
"nll_loss": 2.603203535079956,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.912492513656616,
"rewards/margins": 0.15401321649551392,
"rewards/rejected": -3.0665059089660645,
"step": 110
},
{
"epoch": 0.010537176475753519,
"grad_norm": 3.6609084606170654,
"learning_rate": 1.3222222222222222e-06,
"logits/chosen": 4.420263290405273,
"logits/rejected": 4.413485527038574,
"logps/chosen": -9.733232498168945,
"logps/rejected": -10.436403274536133,
"loss": 2.5418,
"nll_loss": 1.9018337726593018,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.9199700355529785,
"rewards/margins": 0.21095120906829834,
"rewards/rejected": -3.1309211254119873,
"step": 120
},
{
"epoch": 0.011415274515399644,
"grad_norm": 10.068683624267578,
"learning_rate": 1.4333333333333335e-06,
"logits/chosen": 4.42335319519043,
"logits/rejected": 4.493862152099609,
"logps/chosen": -9.73039722442627,
"logps/rejected": -10.323850631713867,
"loss": 2.5285,
"nll_loss": 1.8774116039276123,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.919119358062744,
"rewards/margins": 0.17803625762462616,
"rewards/rejected": -3.0971553325653076,
"step": 130
},
{
"epoch": 0.01229337255504577,
"grad_norm": 11.362488746643066,
"learning_rate": 1.5444444444444446e-06,
"logits/chosen": 4.332414150238037,
"logits/rejected": 4.329668045043945,
"logps/chosen": -9.279766082763672,
"logps/rejected": -9.99905014038086,
"loss": 2.7181,
"nll_loss": 2.0923380851745605,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.7839295864105225,
"rewards/margins": 0.21578574180603027,
"rewards/rejected": -2.9997153282165527,
"step": 140
},
{
"epoch": 0.013171470594691898,
"grad_norm": 14.254817008972168,
"learning_rate": 1.6555555555555559e-06,
"logits/chosen": 4.319321155548096,
"logits/rejected": 4.321578502655029,
"logps/chosen": -9.059564590454102,
"logps/rejected": -9.602411270141602,
"loss": 2.8514,
"nll_loss": 2.1910147666931152,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.717869758605957,
"rewards/margins": 0.16285373270511627,
"rewards/rejected": -2.880723476409912,
"step": 150
},
{
"epoch": 0.014049568634338023,
"grad_norm": 4.008646488189697,
"learning_rate": 1.7666666666666668e-06,
"logits/chosen": 4.2461042404174805,
"logits/rejected": 4.25100040435791,
"logps/chosen": -8.857942581176758,
"logps/rejected": -9.37680435180664,
"loss": 2.5006,
"nll_loss": 1.839271903038025,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -2.6573829650878906,
"rewards/margins": 0.15565846860408783,
"rewards/rejected": -2.8130412101745605,
"step": 160
},
{
"epoch": 0.01492766667398415,
"grad_norm": 10.644233703613281,
"learning_rate": 1.8777777777777778e-06,
"logits/chosen": 4.304837226867676,
"logits/rejected": 4.3417558670043945,
"logps/chosen": -8.620405197143555,
"logps/rejected": -9.385229110717773,
"loss": 2.5802,
"nll_loss": 1.932861328125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.5861220359802246,
"rewards/margins": 0.22944733500480652,
"rewards/rejected": -2.8155694007873535,
"step": 170
},
{
"epoch": 0.015805764713630276,
"grad_norm": 9.301039695739746,
"learning_rate": 1.988888888888889e-06,
"logits/chosen": 4.26754903793335,
"logits/rejected": 4.271862030029297,
"logps/chosen": -8.370404243469238,
"logps/rejected": -8.689592361450195,
"loss": 2.4808,
"nll_loss": 1.7635695934295654,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -2.5111212730407715,
"rewards/margins": 0.09575649350881577,
"rewards/rejected": -2.6068778038024902,
"step": 180
},
{
"epoch": 0.016683862753276404,
"grad_norm": 6.643105506896973,
"learning_rate": 2.1000000000000002e-06,
"logits/chosen": 4.255660533905029,
"logits/rejected": 4.268857955932617,
"logps/chosen": -7.950819492340088,
"logps/rejected": -8.541925430297852,
"loss": 2.2629,
"nll_loss": 1.6152054071426392,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.3852455615997314,
"rewards/margins": 0.17733201384544373,
"rewards/rejected": -2.562577724456787,
"step": 190
},
{
"epoch": 0.01756196079292253,
"grad_norm": 9.348048210144043,
"learning_rate": 2.2111111111111113e-06,
"logits/chosen": 4.450976848602295,
"logits/rejected": 4.4492387771606445,
"logps/chosen": -7.3747453689575195,
"logps/rejected": -8.266874313354492,
"loss": 2.0158,
"nll_loss": 1.3951395750045776,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.212423801422119,
"rewards/margins": 0.2676388621330261,
"rewards/rejected": -2.480062484741211,
"step": 200
},
{
"epoch": 0.018440058832568655,
"grad_norm": 5.35267972946167,
"learning_rate": 2.3222222222222224e-06,
"logits/chosen": 4.387726783752441,
"logits/rejected": 4.407253265380859,
"logps/chosen": -6.376626491546631,
"logps/rejected": -7.535942077636719,
"loss": 1.7794,
"nll_loss": 1.1858270168304443,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.9129879474639893,
"rewards/margins": 0.3477945923805237,
"rewards/rejected": -2.2607827186584473,
"step": 210
},
{
"epoch": 0.019318156872214783,
"grad_norm": 8.461145401000977,
"learning_rate": 2.4333333333333335e-06,
"logits/chosen": 4.57470703125,
"logits/rejected": 4.573002815246582,
"logps/chosen": -5.478797912597656,
"logps/rejected": -6.914730072021484,
"loss": 1.4017,
"nll_loss": 0.8511344194412231,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.643639326095581,
"rewards/margins": 0.4307795464992523,
"rewards/rejected": -2.0744190216064453,
"step": 220
},
{
"epoch": 0.02019625491186091,
"grad_norm": 5.639391899108887,
"learning_rate": 2.5444444444444446e-06,
"logits/chosen": 4.498848915100098,
"logits/rejected": 4.52827262878418,
"logps/chosen": -4.859742164611816,
"logps/rejected": -5.578665733337402,
"loss": 1.4217,
"nll_loss": 0.754524827003479,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.4579226970672607,
"rewards/margins": 0.21567705273628235,
"rewards/rejected": -1.6735999584197998,
"step": 230
},
{
"epoch": 0.021074352951507037,
"grad_norm": 5.855747699737549,
"learning_rate": 2.6555555555555556e-06,
"logits/chosen": 4.4441728591918945,
"logits/rejected": 4.421013832092285,
"logps/chosen": -3.958739757537842,
"logps/rejected": -4.544581413269043,
"loss": 1.3529,
"nll_loss": 0.6709738969802856,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.1876219511032104,
"rewards/margins": 0.17575237154960632,
"rewards/rejected": -1.3633743524551392,
"step": 240
},
{
"epoch": 0.02195245099115316,
"grad_norm": 5.383121490478516,
"learning_rate": 2.766666666666667e-06,
"logits/chosen": 4.187775611877441,
"logits/rejected": 4.246646404266357,
"logps/chosen": -2.965056896209717,
"logps/rejected": -3.8153586387634277,
"loss": 1.0879,
"nll_loss": 0.45148134231567383,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8895170092582703,
"rewards/margins": 0.255090594291687,
"rewards/rejected": -1.144607663154602,
"step": 250
},
{
"epoch": 0.02283054903079929,
"grad_norm": 4.042531967163086,
"learning_rate": 2.8777777777777782e-06,
"logits/chosen": 4.055316925048828,
"logits/rejected": 4.1006879806518555,
"logps/chosen": -2.189060688018799,
"logps/rejected": -2.7986176013946533,
"loss": 0.9665,
"nll_loss": 0.3139139711856842,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6567181348800659,
"rewards/margins": 0.1828671246767044,
"rewards/rejected": -0.8395851850509644,
"step": 260
},
{
"epoch": 0.023708647070445416,
"grad_norm": 3.6113719940185547,
"learning_rate": 2.988888888888889e-06,
"logits/chosen": 4.224070072174072,
"logits/rejected": 4.2020182609558105,
"logps/chosen": -1.5525212287902832,
"logps/rejected": -2.3060789108276367,
"loss": 0.7876,
"nll_loss": 0.17524096369743347,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.465756356716156,
"rewards/margins": 0.22606734931468964,
"rewards/rejected": -0.6918237805366516,
"step": 270
},
{
"epoch": 0.02458674511009154,
"grad_norm": 2.20628023147583,
"learning_rate": 3.1000000000000004e-06,
"logits/chosen": 3.9985511302948,
"logits/rejected": 3.9572558403015137,
"logps/chosen": -1.0324242115020752,
"logps/rejected": -1.434692621231079,
"loss": 0.8117,
"nll_loss": 0.1578795313835144,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.309727281332016,
"rewards/margins": 0.12068048864603043,
"rewards/rejected": -0.4304077625274658,
"step": 280
},
{
"epoch": 0.025464843149737668,
"grad_norm": 2.605509042739868,
"learning_rate": 3.2111111111111115e-06,
"logits/chosen": 3.9703261852264404,
"logits/rejected": 3.978794574737549,
"logps/chosen": -1.302750587463379,
"logps/rejected": -1.4635370969772339,
"loss": 0.8424,
"nll_loss": 0.15038228034973145,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.39082518219947815,
"rewards/margins": 0.048235934227705,
"rewards/rejected": -0.43906116485595703,
"step": 290
},
{
"epoch": 0.026342941189383795,
"grad_norm": 4.225305080413818,
"learning_rate": 3.322222222222222e-06,
"logits/chosen": 3.872316360473633,
"logits/rejected": 3.909350872039795,
"logps/chosen": -0.7655197381973267,
"logps/rejected": -1.3384641408920288,
"loss": 0.7091,
"nll_loss": 0.07751598209142685,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2296559065580368,
"rewards/margins": 0.17188331484794617,
"rewards/rejected": -0.40153923630714417,
"step": 300
},
{
"epoch": 0.027221039229029922,
"grad_norm": 2.167863607406616,
"learning_rate": 3.4333333333333336e-06,
"logits/chosen": 3.8423914909362793,
"logits/rejected": 3.8470401763916016,
"logps/chosen": -0.781024158000946,
"logps/rejected": -1.1648194789886475,
"loss": 0.7324,
"nll_loss": 0.07609061896800995,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.23430728912353516,
"rewards/margins": 0.11513856798410416,
"rewards/rejected": -0.3494458794593811,
"step": 310
},
{
"epoch": 0.028099137268676046,
"grad_norm": 2.662125587463379,
"learning_rate": 3.5444444444444447e-06,
"logits/chosen": 3.844832181930542,
"logits/rejected": 3.897855758666992,
"logps/chosen": -0.8712307214736938,
"logps/rejected": -1.2118942737579346,
"loss": 0.754,
"nll_loss": 0.09477487206459045,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.2613692581653595,
"rewards/margins": 0.10219905525445938,
"rewards/rejected": -0.3635682463645935,
"step": 320
},
{
"epoch": 0.028977235308322174,
"grad_norm": 3.6492230892181396,
"learning_rate": 3.6555555555555562e-06,
"logits/chosen": 3.791374683380127,
"logits/rejected": 3.823376417160034,
"logps/chosen": -0.8908417820930481,
"logps/rejected": -1.1247615814208984,
"loss": 0.7571,
"nll_loss": 0.07855098694562912,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.26725253462791443,
"rewards/margins": 0.07017592340707779,
"rewards/rejected": -0.337428480386734,
"step": 330
},
{
"epoch": 0.0298553333479683,
"grad_norm": 4.195188999176025,
"learning_rate": 3.766666666666667e-06,
"logits/chosen": 3.7930731773376465,
"logits/rejected": 3.771570920944214,
"logps/chosen": -0.5707345008850098,
"logps/rejected": -1.0306470394134521,
"loss": 0.7241,
"nll_loss": 0.07002006471157074,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.1712203472852707,
"rewards/margins": 0.13797374069690704,
"rewards/rejected": -0.30919408798217773,
"step": 340
},
{
"epoch": 0.03073343138761443,
"grad_norm": 6.730047225952148,
"learning_rate": 3.877777777777778e-06,
"logits/chosen": 3.7914137840270996,
"logits/rejected": 3.830873966217041,
"logps/chosen": -0.48989325761795044,
"logps/rejected": -1.329006552696228,
"loss": 0.6549,
"nll_loss": 0.04915159195661545,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.14696797728538513,
"rewards/margins": 0.2517339587211609,
"rewards/rejected": -0.3987019658088684,
"step": 350
},
{
"epoch": 0.03161152942726055,
"grad_norm": 5.190159320831299,
"learning_rate": 3.9888888888888895e-06,
"logits/chosen": 3.5701937675476074,
"logits/rejected": 3.623018264770508,
"logps/chosen": -1.0031934976577759,
"logps/rejected": -1.3071503639221191,
"loss": 0.7632,
"nll_loss": 0.07658834755420685,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.3009580075740814,
"rewards/margins": 0.09118713438510895,
"rewards/rejected": -0.39214515686035156,
"step": 360
},
{
"epoch": 0.03248962746690668,
"grad_norm": 6.694168567657471,
"learning_rate": 4.1e-06,
"logits/chosen": 3.823359727859497,
"logits/rejected": 3.852163314819336,
"logps/chosen": -0.6341744661331177,
"logps/rejected": -1.3019847869873047,
"loss": 0.7057,
"nll_loss": 0.07171504944562912,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.19025234878063202,
"rewards/margins": 0.20034310221672058,
"rewards/rejected": -0.3905954957008362,
"step": 370
},
{
"epoch": 0.03336772550655281,
"grad_norm": 7.252128601074219,
"learning_rate": 4.211111111111112e-06,
"logits/chosen": 3.7956886291503906,
"logits/rejected": 3.795431137084961,
"logps/chosen": -0.7523115277290344,
"logps/rejected": -1.489225149154663,
"loss": 0.6923,
"nll_loss": 0.07404422760009766,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2256934642791748,
"rewards/margins": 0.22107413411140442,
"rewards/rejected": -0.4467676281929016,
"step": 380
},
{
"epoch": 0.034245823546198935,
"grad_norm": 2.945425271987915,
"learning_rate": 4.322222222222223e-06,
"logits/chosen": 3.5031909942626953,
"logits/rejected": 3.5364387035369873,
"logps/chosen": -0.6282280087471008,
"logps/rejected": -1.3442871570587158,
"loss": 0.7103,
"nll_loss": 0.06170845031738281,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.18846839666366577,
"rewards/margins": 0.21481776237487793,
"rewards/rejected": -0.4032861292362213,
"step": 390
},
{
"epoch": 0.03512392158584506,
"grad_norm": 6.988142490386963,
"learning_rate": 4.433333333333334e-06,
"logits/chosen": 3.9105467796325684,
"logits/rejected": 3.931438446044922,
"logps/chosen": -0.9674631357192993,
"logps/rejected": -1.5149281024932861,
"loss": 0.7768,
"nll_loss": 0.11233203113079071,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.2902389466762543,
"rewards/margins": 0.1642395257949829,
"rewards/rejected": -0.45447850227355957,
"step": 400
},
{
"epoch": 0.03600201962549118,
"grad_norm": 0.2611992359161377,
"learning_rate": 4.544444444444445e-06,
"logits/chosen": 3.6991469860076904,
"logits/rejected": 3.726545810699463,
"logps/chosen": -0.5881733894348145,
"logps/rejected": -1.2820765972137451,
"loss": 0.6929,
"nll_loss": 0.06608637422323227,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.17645201086997986,
"rewards/margins": 0.20817098021507263,
"rewards/rejected": -0.38462305068969727,
"step": 410
},
{
"epoch": 0.03688011766513731,
"grad_norm": 3.019066095352173,
"learning_rate": 4.655555555555556e-06,
"logits/chosen": 3.4919254779815674,
"logits/rejected": 3.5151939392089844,
"logps/chosen": -0.5710722804069519,
"logps/rejected": -1.1787471771240234,
"loss": 0.712,
"nll_loss": 0.05282425880432129,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.17132170498371124,
"rewards/margins": 0.18230250477790833,
"rewards/rejected": -0.353624165058136,
"step": 420
},
{
"epoch": 0.03775821570478344,
"grad_norm": 2.8214099407196045,
"learning_rate": 4.766666666666667e-06,
"logits/chosen": 3.7750792503356934,
"logits/rejected": 3.7456068992614746,
"logps/chosen": -0.7161394357681274,
"logps/rejected": -1.8331083059310913,
"loss": 0.6897,
"nll_loss": 0.09056379646062851,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2148418426513672,
"rewards/margins": 0.335090696811676,
"rewards/rejected": -0.5499325394630432,
"step": 430
},
{
"epoch": 0.038636313744429565,
"grad_norm": 5.72930383682251,
"learning_rate": 4.877777777777778e-06,
"logits/chosen": 3.5365283489227295,
"logits/rejected": 3.5479636192321777,
"logps/chosen": -0.7414464950561523,
"logps/rejected": -1.5791943073272705,
"loss": 0.6933,
"nll_loss": 0.0765593945980072,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.22243395447731018,
"rewards/margins": 0.2513243556022644,
"rewards/rejected": -0.4737583100795746,
"step": 440
},
{
"epoch": 0.03951441178407569,
"grad_norm": 2.2949461936950684,
"learning_rate": 4.988888888888889e-06,
"logits/chosen": 3.2915852069854736,
"logits/rejected": 3.309730052947998,
"logps/chosen": -0.5450859069824219,
"logps/rejected": -0.948569118976593,
"loss": 0.7334,
"nll_loss": 0.0632125660777092,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1635257601737976,
"rewards/margins": 0.12104494869709015,
"rewards/rejected": -0.28457072377204895,
"step": 450
},
{
"epoch": 0.04039250982372182,
"grad_norm": 7.028234481811523,
"learning_rate": 5.1e-06,
"logits/chosen": 3.5347137451171875,
"logits/rejected": 3.542628049850464,
"logps/chosen": -0.6212174296379089,
"logps/rejected": -1.3661630153656006,
"loss": 0.7226,
"nll_loss": 0.08348599821329117,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.18636523187160492,
"rewards/margins": 0.2234836369752884,
"rewards/rejected": -0.4098488688468933,
"step": 460
},
{
"epoch": 0.04127060786336795,
"grad_norm": 5.250203609466553,
"learning_rate": 5.211111111111111e-06,
"logits/chosen": 3.496631622314453,
"logits/rejected": 3.5331413745880127,
"logps/chosen": -0.6912875175476074,
"logps/rejected": -1.6311848163604736,
"loss": 0.6796,
"nll_loss": 0.062180064618587494,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.20738628506660461,
"rewards/margins": 0.2819691598415375,
"rewards/rejected": -0.4893553853034973,
"step": 470
},
{
"epoch": 0.042148705903014075,
"grad_norm": 2.3006033897399902,
"learning_rate": 5.322222222222223e-06,
"logits/chosen": 3.731518268585205,
"logits/rejected": 3.775359630584717,
"logps/chosen": -0.8064204454421997,
"logps/rejected": -1.6423746347427368,
"loss": 0.7282,
"nll_loss": 0.09897418320178986,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.2419261485338211,
"rewards/margins": 0.25078627467155457,
"rewards/rejected": -0.49271243810653687,
"step": 480
},
{
"epoch": 0.043026803942660195,
"grad_norm": 6.259355545043945,
"learning_rate": 5.4333333333333335e-06,
"logits/chosen": 3.3239219188690186,
"logits/rejected": 3.3287899494171143,
"logps/chosen": -0.623904824256897,
"logps/rejected": -1.5192670822143555,
"loss": 0.7155,
"nll_loss": 0.07001027464866638,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.18717142939567566,
"rewards/margins": 0.2686087191104889,
"rewards/rejected": -0.4557802081108093,
"step": 490
},
{
"epoch": 0.04390490198230632,
"grad_norm": 4.574368000030518,
"learning_rate": 5.544444444444445e-06,
"logits/chosen": 3.667168378829956,
"logits/rejected": 3.707645893096924,
"logps/chosen": -0.7253153920173645,
"logps/rejected": -1.8622252941131592,
"loss": 0.6561,
"nll_loss": 0.06381665915250778,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.21759465336799622,
"rewards/margins": 0.34107303619384766,
"rewards/rejected": -0.5586676597595215,
"step": 500
},
{
"epoch": 0.04478300002195245,
"grad_norm": 36.264381408691406,
"learning_rate": 5.6555555555555566e-06,
"logits/chosen": 3.398568630218506,
"logits/rejected": 3.468022108078003,
"logps/chosen": -0.5565214157104492,
"logps/rejected": -1.1638367176055908,
"loss": 0.7078,
"nll_loss": 0.06747711449861526,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.16695642471313477,
"rewards/margins": 0.18219462037086487,
"rewards/rejected": -0.34915101528167725,
"step": 510
},
{
"epoch": 0.04566109806159858,
"grad_norm": 4.252997398376465,
"learning_rate": 5.766666666666667e-06,
"logits/chosen": 3.5346503257751465,
"logits/rejected": 3.526895046234131,
"logps/chosen": -0.9578359723091125,
"logps/rejected": -1.617163896560669,
"loss": 0.7567,
"nll_loss": 0.11074657738208771,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.2873508036136627,
"rewards/margins": 0.19779837131500244,
"rewards/rejected": -0.4851491451263428,
"step": 520
},
{
"epoch": 0.046539196101244705,
"grad_norm": 10.661053657531738,
"learning_rate": 5.877777777777778e-06,
"logits/chosen": 3.5897374153137207,
"logits/rejected": 3.599020481109619,
"logps/chosen": -0.8656774759292603,
"logps/rejected": -1.2782808542251587,
"loss": 0.7624,
"nll_loss": 0.09282848984003067,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.25970324873924255,
"rewards/margins": 0.12378102540969849,
"rewards/rejected": -0.38348424434661865,
"step": 530
},
{
"epoch": 0.04741729414089083,
"grad_norm": 2.3466947078704834,
"learning_rate": 5.98888888888889e-06,
"logits/chosen": 3.41766619682312,
"logits/rejected": 3.4891743659973145,
"logps/chosen": -0.69093918800354,
"logps/rejected": -2.0744166374206543,
"loss": 0.6747,
"nll_loss": 0.0662418007850647,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.20728178322315216,
"rewards/margins": 0.41504326462745667,
"rewards/rejected": -0.6223250031471252,
"step": 540
},
{
"epoch": 0.04829539218053696,
"grad_norm": 4.315151214599609,
"learning_rate": 6.1e-06,
"logits/chosen": 3.4109902381896973,
"logits/rejected": 3.5040442943573,
"logps/chosen": -0.5460541248321533,
"logps/rejected": -1.9011294841766357,
"loss": 0.653,
"nll_loss": 0.06273610144853592,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.16381624341011047,
"rewards/margins": 0.40652260184288025,
"rewards/rejected": -0.5703388452529907,
"step": 550
},
{
"epoch": 0.04917349022018308,
"grad_norm": 4.520711898803711,
"learning_rate": 6.211111111111111e-06,
"logits/chosen": 3.562473773956299,
"logits/rejected": 3.6386642456054688,
"logps/chosen": -0.7312324047088623,
"logps/rejected": -1.6271352767944336,
"loss": 0.6966,
"nll_loss": 0.06128234788775444,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.21936972439289093,
"rewards/margins": 0.26877090334892273,
"rewards/rejected": -0.48814067244529724,
"step": 560
},
{
"epoch": 0.05005158825982921,
"grad_norm": 7.277171611785889,
"learning_rate": 6.322222222222223e-06,
"logits/chosen": 3.472989559173584,
"logits/rejected": 3.443589687347412,
"logps/chosen": -0.7062110900878906,
"logps/rejected": -2.1304116249084473,
"loss": 0.6869,
"nll_loss": 0.09942348301410675,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.21186332404613495,
"rewards/margins": 0.42726022005081177,
"rewards/rejected": -0.6391235589981079,
"step": 570
},
{
"epoch": 0.050929686299475335,
"grad_norm": 7.543278217315674,
"learning_rate": 6.433333333333333e-06,
"logits/chosen": 3.464301347732544,
"logits/rejected": 3.444230556488037,
"logps/chosen": -0.7348255515098572,
"logps/rejected": -1.602164626121521,
"loss": 0.7316,
"nll_loss": 0.08772562444210052,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.22044768929481506,
"rewards/margins": 0.26020172238349915,
"rewards/rejected": -0.4806493818759918,
"step": 580
},
{
"epoch": 0.05180778433912146,
"grad_norm": 3.7779767513275146,
"learning_rate": 6.544444444444445e-06,
"logits/chosen": 3.633018970489502,
"logits/rejected": 3.709826946258545,
"logps/chosen": -0.9298914074897766,
"logps/rejected": -1.606702446937561,
"loss": 0.7953,
"nll_loss": 0.1196284145116806,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2789674401283264,
"rewards/margins": 0.20304329693317413,
"rewards/rejected": -0.48201069235801697,
"step": 590
},
{
"epoch": 0.05268588237876759,
"grad_norm": 7.008880615234375,
"learning_rate": 6.655555555555556e-06,
"logits/chosen": 3.381080150604248,
"logits/rejected": 3.444829225540161,
"logps/chosen": -0.7211336493492126,
"logps/rejected": -1.423513650894165,
"loss": 0.7429,
"nll_loss": 0.06740613281726837,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.216340109705925,
"rewards/margins": 0.21071402728557587,
"rewards/rejected": -0.42705410718917847,
"step": 600
},
{
"epoch": 0.05356398041841372,
"grad_norm": 5.524548530578613,
"learning_rate": 6.7666666666666665e-06,
"logits/chosen": 3.4077486991882324,
"logits/rejected": 3.377532958984375,
"logps/chosen": -0.5095429420471191,
"logps/rejected": -1.0147814750671387,
"loss": 0.7157,
"nll_loss": 0.0589555986225605,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.15286290645599365,
"rewards/margins": 0.1515716016292572,
"rewards/rejected": -0.30443447828292847,
"step": 610
},
{
"epoch": 0.054442078458059845,
"grad_norm": 4.042827606201172,
"learning_rate": 6.8777777777777785e-06,
"logits/chosen": 3.716031551361084,
"logits/rejected": 3.713074207305908,
"logps/chosen": -0.8152651786804199,
"logps/rejected": -1.5531432628631592,
"loss": 0.7318,
"nll_loss": 0.08608300983905792,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.24457958340644836,
"rewards/margins": 0.22136345505714417,
"rewards/rejected": -0.46594300866127014,
"step": 620
},
{
"epoch": 0.05532017649770597,
"grad_norm": 4.254418849945068,
"learning_rate": 6.9888888888888895e-06,
"logits/chosen": 3.4341864585876465,
"logits/rejected": 3.458519458770752,
"logps/chosen": -0.6925168037414551,
"logps/rejected": -1.5713815689086914,
"loss": 0.6898,
"nll_loss": 0.07025544345378876,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.20775504410266876,
"rewards/margins": 0.26365941762924194,
"rewards/rejected": -0.4714145064353943,
"step": 630
},
{
"epoch": 0.05619827453735209,
"grad_norm": 2.973627805709839,
"learning_rate": 7.100000000000001e-06,
"logits/chosen": 3.4850242137908936,
"logits/rejected": 3.5192267894744873,
"logps/chosen": -1.1339752674102783,
"logps/rejected": -1.5586907863616943,
"loss": 0.8059,
"nll_loss": 0.10102218389511108,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.34019264578819275,
"rewards/margins": 0.1274145543575287,
"rewards/rejected": -0.46760720014572144,
"step": 640
},
{
"epoch": 0.05707637257699822,
"grad_norm": 3.3906142711639404,
"learning_rate": 7.211111111111112e-06,
"logits/chosen": 3.6287121772766113,
"logits/rejected": 3.5881965160369873,
"logps/chosen": -0.7608178853988647,
"logps/rejected": -1.2269501686096191,
"loss": 0.7394,
"nll_loss": 0.0750693827867508,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.228245347738266,
"rewards/margins": 0.13983972370624542,
"rewards/rejected": -0.3680850863456726,
"step": 650
},
{
"epoch": 0.05795447061664435,
"grad_norm": 0.03068475052714348,
"learning_rate": 7.322222222222223e-06,
"logits/chosen": 3.743140697479248,
"logits/rejected": 3.7635676860809326,
"logps/chosen": -0.5632290840148926,
"logps/rejected": -1.5820039510726929,
"loss": 0.6807,
"nll_loss": 0.07005371153354645,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.16896870732307434,
"rewards/margins": 0.30563241243362427,
"rewards/rejected": -0.47460120916366577,
"step": 660
},
{
"epoch": 0.058832568656290475,
"grad_norm": 2.2352776527404785,
"learning_rate": 7.433333333333334e-06,
"logits/chosen": 3.5528149604797363,
"logits/rejected": 3.5446677207946777,
"logps/chosen": -0.5461568832397461,
"logps/rejected": -1.2885067462921143,
"loss": 0.6956,
"nll_loss": 0.06532245129346848,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.16384705901145935,
"rewards/margins": 0.22270497679710388,
"rewards/rejected": -0.3865520656108856,
"step": 670
},
{
"epoch": 0.0597106666959366,
"grad_norm": 2.6797573566436768,
"learning_rate": 7.544444444444445e-06,
"logits/chosen": 3.3902244567871094,
"logits/rejected": 3.4180169105529785,
"logps/chosen": -0.6432263255119324,
"logps/rejected": -1.1323941946029663,
"loss": 0.7574,
"nll_loss": 0.07451333105564117,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.19296793639659882,
"rewards/margins": 0.1467503160238266,
"rewards/rejected": -0.3397182822227478,
"step": 680
},
{
"epoch": 0.06058876473558273,
"grad_norm": 5.877313137054443,
"learning_rate": 7.655555555555556e-06,
"logits/chosen": 3.4120171070098877,
"logits/rejected": 3.4442646503448486,
"logps/chosen": -0.7911199331283569,
"logps/rejected": -1.4578664302825928,
"loss": 0.7751,
"nll_loss": 0.1019410640001297,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.23733600974082947,
"rewards/margins": 0.20002400875091553,
"rewards/rejected": -0.4373599886894226,
"step": 690
},
{
"epoch": 0.06146686277522886,
"grad_norm": 3.2915701866149902,
"learning_rate": 7.766666666666666e-06,
"logits/chosen": 3.5103302001953125,
"logits/rejected": 3.4995181560516357,
"logps/chosen": -0.6160825490951538,
"logps/rejected": -1.1019346714019775,
"loss": 0.7363,
"nll_loss": 0.06433330476284027,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.18482479453086853,
"rewards/margins": 0.14575564861297607,
"rewards/rejected": -0.3305804133415222,
"step": 700
},
{
"epoch": 0.06234496081487498,
"grad_norm": 3.7106151580810547,
"learning_rate": 7.877777777777778e-06,
"logits/chosen": 3.474386692047119,
"logits/rejected": 3.4553630352020264,
"logps/chosen": -0.6972242593765259,
"logps/rejected": -1.5925921201705933,
"loss": 0.7109,
"nll_loss": 0.07773466408252716,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.20916728675365448,
"rewards/margins": 0.2686103284358978,
"rewards/rejected": -0.4777776300907135,
"step": 710
},
{
"epoch": 0.0632230588545211,
"grad_norm": 1.7685184478759766,
"learning_rate": 7.98888888888889e-06,
"logits/chosen": 3.6326797008514404,
"logits/rejected": 3.625549793243408,
"logps/chosen": -0.5006519556045532,
"logps/rejected": -1.8575595617294312,
"loss": 0.6416,
"nll_loss": 0.05640099197626114,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.15019558370113373,
"rewards/margins": 0.4070723056793213,
"rewards/rejected": -0.5572679042816162,
"step": 720
},
{
"epoch": 0.06410115689416723,
"grad_norm": 4.595531463623047,
"learning_rate": 8.1e-06,
"logits/chosen": 3.6036553382873535,
"logits/rejected": 3.6813888549804688,
"logps/chosen": -1.0952359437942505,
"logps/rejected": -2.0565478801727295,
"loss": 0.7318,
"nll_loss": 0.06606093794107437,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.32857078313827515,
"rewards/margins": 0.28839364647865295,
"rewards/rejected": -0.6169643998146057,
"step": 730
},
{
"epoch": 0.06497925493381336,
"grad_norm": 2.7277488708496094,
"learning_rate": 8.211111111111112e-06,
"logits/chosen": 3.4828929901123047,
"logits/rejected": 3.536961317062378,
"logps/chosen": -0.6164706945419312,
"logps/rejected": -2.1122829914093018,
"loss": 0.6062,
"nll_loss": 0.040962688624858856,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.18494121730327606,
"rewards/margins": 0.44874364137649536,
"rewards/rejected": -0.6336848735809326,
"step": 740
},
{
"epoch": 0.06585735297345949,
"grad_norm": 3.6677157878875732,
"learning_rate": 8.322222222222223e-06,
"logits/chosen": 3.361325740814209,
"logits/rejected": 3.362967014312744,
"logps/chosen": -0.9366201162338257,
"logps/rejected": -1.7140756845474243,
"loss": 0.7507,
"nll_loss": 0.10440067946910858,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.28098607063293457,
"rewards/margins": 0.23323671519756317,
"rewards/rejected": -0.5142227411270142,
"step": 750
},
{
"epoch": 0.06673545101310562,
"grad_norm": 3.729750633239746,
"learning_rate": 8.433333333333334e-06,
"logits/chosen": 3.343749523162842,
"logits/rejected": 3.401230573654175,
"logps/chosen": -0.7000004649162292,
"logps/rejected": -1.9269546270370483,
"loss": 0.6931,
"nll_loss": 0.07972760498523712,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.2100001573562622,
"rewards/margins": 0.3680862486362457,
"rewards/rejected": -0.5780864357948303,
"step": 760
},
{
"epoch": 0.06761354905275174,
"grad_norm": 7.166464328765869,
"learning_rate": 8.544444444444445e-06,
"logits/chosen": 3.4859509468078613,
"logits/rejected": 3.587602138519287,
"logps/chosen": -0.7638369798660278,
"logps/rejected": -2.2576065063476562,
"loss": 0.6832,
"nll_loss": 0.0738845020532608,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.22915109992027283,
"rewards/margins": 0.4481307864189148,
"rewards/rejected": -0.6772819757461548,
"step": 770
},
{
"epoch": 0.06849164709239787,
"grad_norm": 2.416330575942993,
"learning_rate": 8.655555555555557e-06,
"logits/chosen": 3.408698558807373,
"logits/rejected": 3.4245800971984863,
"logps/chosen": -0.8418534398078918,
"logps/rejected": -1.4894107580184937,
"loss": 0.757,
"nll_loss": 0.0903228372335434,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.25255605578422546,
"rewards/margins": 0.1942671835422516,
"rewards/rejected": -0.44682326912879944,
"step": 780
},
{
"epoch": 0.069369745132044,
"grad_norm": 4.281314373016357,
"learning_rate": 8.766666666666669e-06,
"logits/chosen": 3.2489428520202637,
"logits/rejected": 3.2514851093292236,
"logps/chosen": -0.8190711736679077,
"logps/rejected": -1.3487728834152222,
"loss": 0.7399,
"nll_loss": 0.07894166558980942,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.24572138488292694,
"rewards/margins": 0.15891052782535553,
"rewards/rejected": -0.40463191270828247,
"step": 790
},
{
"epoch": 0.07024784317169012,
"grad_norm": 1.3228946924209595,
"learning_rate": 8.877777777777779e-06,
"logits/chosen": 3.2964024543762207,
"logits/rejected": 3.3055152893066406,
"logps/chosen": -0.8143989443778992,
"logps/rejected": -1.2753360271453857,
"loss": 0.75,
"nll_loss": 0.07704529166221619,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.24431967735290527,
"rewards/margins": 0.13828110694885254,
"rewards/rejected": -0.3826007843017578,
"step": 800
},
{
"epoch": 0.07112594121133625,
"grad_norm": 3.7781331539154053,
"learning_rate": 8.988888888888889e-06,
"logits/chosen": 3.68397855758667,
"logits/rejected": 3.6694297790527344,
"logps/chosen": -0.9365280866622925,
"logps/rejected": -1.7311958074569702,
"loss": 0.7448,
"nll_loss": 0.08974708616733551,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.28095847368240356,
"rewards/margins": 0.23840029537677765,
"rewards/rejected": -0.51935875415802,
"step": 810
},
{
"epoch": 0.07200403925098237,
"grad_norm": 2.7228267192840576,
"learning_rate": 9.100000000000001e-06,
"logits/chosen": 3.6748175621032715,
"logits/rejected": 3.7467494010925293,
"logps/chosen": -0.5383256077766418,
"logps/rejected": -2.2832655906677246,
"loss": 0.6081,
"nll_loss": 0.06415946036577225,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.16149768233299255,
"rewards/margins": 0.523482084274292,
"rewards/rejected": -0.6849797964096069,
"step": 820
},
{
"epoch": 0.0728821372906285,
"grad_norm": 1.3846280574798584,
"learning_rate": 9.211111111111111e-06,
"logits/chosen": 3.2965283393859863,
"logits/rejected": 3.329348087310791,
"logps/chosen": -0.5321189761161804,
"logps/rejected": -2.2239809036254883,
"loss": 0.6186,
"nll_loss": 0.04651743918657303,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.15963570773601532,
"rewards/margins": 0.5075585842132568,
"rewards/rejected": -0.6671942472457886,
"step": 830
},
{
"epoch": 0.07376023533027462,
"grad_norm": 4.5536675453186035,
"learning_rate": 9.322222222222223e-06,
"logits/chosen": 3.347224473953247,
"logits/rejected": 3.3436226844787598,
"logps/chosen": -0.7508156895637512,
"logps/rejected": -1.867352843284607,
"loss": 0.707,
"nll_loss": 0.0711875781416893,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.2252446860074997,
"rewards/margins": 0.3349612355232239,
"rewards/rejected": -0.56020587682724,
"step": 840
},
{
"epoch": 0.07463833336992075,
"grad_norm": 4.691596508026123,
"learning_rate": 9.433333333333335e-06,
"logits/chosen": 3.253293991088867,
"logits/rejected": 3.2947421073913574,
"logps/chosen": -0.9249482154846191,
"logps/rejected": -1.812909483909607,
"loss": 0.7568,
"nll_loss": 0.09252104163169861,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.2774844765663147,
"rewards/margins": 0.2663884162902832,
"rewards/rejected": -0.5438728928565979,
"step": 850
},
{
"epoch": 0.07551643140956688,
"grad_norm": 4.119868278503418,
"learning_rate": 9.544444444444445e-06,
"logits/chosen": 3.3257553577423096,
"logits/rejected": 3.3328521251678467,
"logps/chosen": -0.8130607604980469,
"logps/rejected": -2.2179336547851562,
"loss": 0.7061,
"nll_loss": 0.09296734631061554,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.2439182549715042,
"rewards/margins": 0.42146188020706177,
"rewards/rejected": -0.6653801202774048,
"step": 860
},
{
"epoch": 0.076394529449213,
"grad_norm": 4.941491603851318,
"learning_rate": 9.655555555555556e-06,
"logits/chosen": 3.2392685413360596,
"logits/rejected": 3.2623963356018066,
"logps/chosen": -0.7361981272697449,
"logps/rejected": -1.6700445413589478,
"loss": 0.7312,
"nll_loss": 0.08274148404598236,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.22085945308208466,
"rewards/margins": 0.28015393018722534,
"rewards/rejected": -0.5010133385658264,
"step": 870
},
{
"epoch": 0.07727262748885913,
"grad_norm": 1.4032851457595825,
"learning_rate": 9.766666666666667e-06,
"logits/chosen": 3.2956814765930176,
"logits/rejected": 3.318169355392456,
"logps/chosen": -0.7056166529655457,
"logps/rejected": -1.8036190271377563,
"loss": 0.7136,
"nll_loss": 0.09712977707386017,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.21168498694896698,
"rewards/margins": 0.3294007182121277,
"rewards/rejected": -0.5410857200622559,
"step": 880
},
{
"epoch": 0.07815072552850526,
"grad_norm": 3.9460878372192383,
"learning_rate": 9.877777777777778e-06,
"logits/chosen": 3.4804458618164062,
"logits/rejected": 3.5131962299346924,
"logps/chosen": -0.9113373756408691,
"logps/rejected": -1.8960120677947998,
"loss": 0.7272,
"nll_loss": 0.08563139289617538,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.2734012007713318,
"rewards/margins": 0.295402467250824,
"rewards/rejected": -0.5688036680221558,
"step": 890
},
{
"epoch": 0.07902882356815139,
"grad_norm": 5.001852989196777,
"learning_rate": 9.98888888888889e-06,
"logits/chosen": 3.5693771839141846,
"logits/rejected": 3.613219738006592,
"logps/chosen": -0.7482819557189941,
"logps/rejected": -1.611090898513794,
"loss": 0.7488,
"nll_loss": 0.10075131803750992,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.2244846075773239,
"rewards/margins": 0.25884273648262024,
"rewards/rejected": -0.48332732915878296,
"step": 900
},
{
"epoch": 0.07990692160779751,
"grad_norm": 3.9788780212402344,
"learning_rate": 9.98888888888889e-06,
"logits/chosen": 3.418684720993042,
"logits/rejected": 3.4473800659179688,
"logps/chosen": -0.5167075991630554,
"logps/rejected": -1.0832570791244507,
"loss": 0.7187,
"nll_loss": 0.053985703736543655,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.155012309551239,
"rewards/margins": 0.16996484994888306,
"rewards/rejected": -0.3249771296977997,
"step": 910
},
{
"epoch": 0.08078501964744364,
"grad_norm": 1.5054136514663696,
"learning_rate": 9.976543209876544e-06,
"logits/chosen": 3.389498233795166,
"logits/rejected": 3.4252688884735107,
"logps/chosen": -0.6502631902694702,
"logps/rejected": -1.8272225856781006,
"loss": 0.6756,
"nll_loss": 0.07306591421365738,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1950789839029312,
"rewards/margins": 0.35308781266212463,
"rewards/rejected": -0.5481668710708618,
"step": 920
},
{
"epoch": 0.08166311768708977,
"grad_norm": 2.810540199279785,
"learning_rate": 9.964197530864198e-06,
"logits/chosen": 3.4912326335906982,
"logits/rejected": 3.503628969192505,
"logps/chosen": -0.4884684681892395,
"logps/rejected": -1.4588502645492554,
"loss": 0.6613,
"nll_loss": 0.04402286559343338,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14654052257537842,
"rewards/margins": 0.29111456871032715,
"rewards/rejected": -0.4376550614833832,
"step": 930
},
{
"epoch": 0.0825412157267359,
"grad_norm": 5.386466979980469,
"learning_rate": 9.951851851851853e-06,
"logits/chosen": 3.386685848236084,
"logits/rejected": 3.4002914428710938,
"logps/chosen": -0.5482162237167358,
"logps/rejected": -1.572486162185669,
"loss": 0.6616,
"nll_loss": 0.053943734616041183,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.16446486115455627,
"rewards/margins": 0.307280957698822,
"rewards/rejected": -0.4717458188533783,
"step": 940
},
{
"epoch": 0.08341931376638202,
"grad_norm": 1.676483154296875,
"learning_rate": 9.939506172839507e-06,
"logits/chosen": 3.3065590858459473,
"logits/rejected": 3.3416500091552734,
"logps/chosen": -0.5273550748825073,
"logps/rejected": -1.707932472229004,
"loss": 0.6729,
"nll_loss": 0.06074246019124985,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.1582065373659134,
"rewards/margins": 0.3541732430458069,
"rewards/rejected": -0.5123798251152039,
"step": 950
},
{
"epoch": 0.08429741180602815,
"grad_norm": 2.689913272857666,
"learning_rate": 9.927160493827162e-06,
"logits/chosen": 3.2740864753723145,
"logits/rejected": 3.335360050201416,
"logps/chosen": -0.7466616630554199,
"logps/rejected": -2.0391454696655273,
"loss": 0.7055,
"nll_loss": 0.09172563254833221,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.2239985167980194,
"rewards/margins": 0.3877451419830322,
"rewards/rejected": -0.6117436289787292,
"step": 960
},
{
"epoch": 0.08517550984567426,
"grad_norm": 3.589853286743164,
"learning_rate": 9.914814814814816e-06,
"logits/chosen": 3.0097994804382324,
"logits/rejected": 3.0853917598724365,
"logps/chosen": -0.5722948312759399,
"logps/rejected": -1.9204362630844116,
"loss": 0.6501,
"nll_loss": 0.051983099430799484,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.17168846726417542,
"rewards/margins": 0.4044424593448639,
"rewards/rejected": -0.5761309266090393,
"step": 970
},
{
"epoch": 0.08605360788532039,
"grad_norm": 1.7398462295532227,
"learning_rate": 9.90246913580247e-06,
"logits/chosen": 3.3110098838806152,
"logits/rejected": 3.4121768474578857,
"logps/chosen": -0.6699460744857788,
"logps/rejected": -2.0870394706726074,
"loss": 0.6724,
"nll_loss": 0.053984154015779495,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.20098385214805603,
"rewards/margins": 0.4251279830932617,
"rewards/rejected": -0.6261118054389954,
"step": 980
},
{
"epoch": 0.08693170592496652,
"grad_norm": 5.3601861000061035,
"learning_rate": 9.890123456790123e-06,
"logits/chosen": 3.1401820182800293,
"logits/rejected": 3.127436399459839,
"logps/chosen": -0.7694223523139954,
"logps/rejected": -1.6481168270111084,
"loss": 0.737,
"nll_loss": 0.09365083277225494,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.230826735496521,
"rewards/margins": 0.2636083662509918,
"rewards/rejected": -0.49443507194519043,
"step": 990
},
{
"epoch": 0.08780980396461265,
"grad_norm": 2.6405630111694336,
"learning_rate": 9.877777777777778e-06,
"logits/chosen": 3.1894805431365967,
"logits/rejected": 3.2360007762908936,
"logps/chosen": -0.5739088654518127,
"logps/rejected": -2.0212159156799316,
"loss": 0.6616,
"nll_loss": 0.05945644527673721,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1721726506948471,
"rewards/margins": 0.4341921806335449,
"rewards/rejected": -0.606364905834198,
"step": 1000
},
{
"epoch": 0.08868790200425877,
"grad_norm": 2.2846076488494873,
"learning_rate": 9.865432098765432e-06,
"logits/chosen": 3.2782859802246094,
"logits/rejected": 3.2814033031463623,
"logps/chosen": -0.5302027463912964,
"logps/rejected": -1.8145701885223389,
"loss": 0.6543,
"nll_loss": 0.05663750320672989,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.15906082093715668,
"rewards/margins": 0.3853102922439575,
"rewards/rejected": -0.5443711280822754,
"step": 1010
},
{
"epoch": 0.0895660000439049,
"grad_norm": 13.04986572265625,
"learning_rate": 9.853086419753087e-06,
"logits/chosen": 2.8904287815093994,
"logits/rejected": 2.907032012939453,
"logps/chosen": -1.1254570484161377,
"logps/rejected": -2.3068795204162598,
"loss": 0.7832,
"nll_loss": 0.1419171392917633,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.33763712644577026,
"rewards/margins": 0.354426771402359,
"rewards/rejected": -0.6920639276504517,
"step": 1020
},
{
"epoch": 0.09044409808355103,
"grad_norm": 2.556560754776001,
"learning_rate": 9.840740740740743e-06,
"logits/chosen": 3.151669502258301,
"logits/rejected": 3.1690382957458496,
"logps/chosen": -0.7756383419036865,
"logps/rejected": -1.6224708557128906,
"loss": 0.7551,
"nll_loss": 0.09495635330677032,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.23269149661064148,
"rewards/margins": 0.25404977798461914,
"rewards/rejected": -0.486741304397583,
"step": 1030
},
{
"epoch": 0.09132219612319716,
"grad_norm": 1.3183997869491577,
"learning_rate": 9.828395061728397e-06,
"logits/chosen": 3.195861339569092,
"logits/rejected": 3.2810165882110596,
"logps/chosen": -0.5329464077949524,
"logps/rejected": -1.2915513515472412,
"loss": 0.7152,
"nll_loss": 0.054315369576215744,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.15988394618034363,
"rewards/margins": 0.22758150100708008,
"rewards/rejected": -0.3874654471874237,
"step": 1040
},
{
"epoch": 0.09220029416284328,
"grad_norm": 1.674479603767395,
"learning_rate": 9.81604938271605e-06,
"logits/chosen": 3.139688491821289,
"logits/rejected": 3.2257683277130127,
"logps/chosen": -0.7113819122314453,
"logps/rejected": -2.1473631858825684,
"loss": 0.679,
"nll_loss": 0.04762103408575058,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.21341457962989807,
"rewards/margins": 0.4307943284511566,
"rewards/rejected": -0.6442088484764099,
"step": 1050
},
{
"epoch": 0.09307839220248941,
"grad_norm": 1.9272900819778442,
"learning_rate": 9.803703703703704e-06,
"logits/chosen": 3.403465986251831,
"logits/rejected": 3.385577440261841,
"logps/chosen": -0.9791383743286133,
"logps/rejected": -2.0091757774353027,
"loss": 0.7499,
"nll_loss": 0.11029829829931259,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.29374146461486816,
"rewards/margins": 0.309011310338974,
"rewards/rejected": -0.6027528047561646,
"step": 1060
},
{
"epoch": 0.09395649024213554,
"grad_norm": 3.530672073364258,
"learning_rate": 9.791358024691359e-06,
"logits/chosen": 2.975001573562622,
"logits/rejected": 3.063398838043213,
"logps/chosen": -0.9404687881469727,
"logps/rejected": -1.803180456161499,
"loss": 0.7547,
"nll_loss": 0.079728864133358,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.28214067220687866,
"rewards/margins": 0.2588135600090027,
"rewards/rejected": -0.5409542322158813,
"step": 1070
},
{
"epoch": 0.09483458828178166,
"grad_norm": 7.528195381164551,
"learning_rate": 9.779012345679013e-06,
"logits/chosen": 3.1206297874450684,
"logits/rejected": 3.173870086669922,
"logps/chosen": -0.6394155621528625,
"logps/rejected": -1.1264150142669678,
"loss": 0.7336,
"nll_loss": 0.06591827422380447,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.19182467460632324,
"rewards/margins": 0.14609983563423157,
"rewards/rejected": -0.3379245400428772,
"step": 1080
},
{
"epoch": 0.09571268632142779,
"grad_norm": 2.3711354732513428,
"learning_rate": 9.766666666666667e-06,
"logits/chosen": 3.2346444129943848,
"logits/rejected": 3.3024227619171143,
"logps/chosen": -0.7532138824462891,
"logps/rejected": -1.9977819919586182,
"loss": 0.6926,
"nll_loss": 0.0733107179403305,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.22596418857574463,
"rewards/margins": 0.3733704090118408,
"rewards/rejected": -0.5993345975875854,
"step": 1090
},
{
"epoch": 0.09659078436107392,
"grad_norm": 2.3100759983062744,
"learning_rate": 9.754320987654322e-06,
"logits/chosen": 3.0819344520568848,
"logits/rejected": 3.081664562225342,
"logps/chosen": -0.39054492115974426,
"logps/rejected": -1.3339643478393555,
"loss": 0.6571,
"nll_loss": 0.03647618740797043,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.11716349422931671,
"rewards/margins": 0.2830258309841156,
"rewards/rejected": -0.4001893401145935,
"step": 1100
},
{
"epoch": 0.09746888240072005,
"grad_norm": 4.387854099273682,
"learning_rate": 9.741975308641976e-06,
"logits/chosen": 3.0971732139587402,
"logits/rejected": 3.105783224105835,
"logps/chosen": -0.6461865305900574,
"logps/rejected": -1.5687153339385986,
"loss": 0.7209,
"nll_loss": 0.08219017088413239,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.19385597109794617,
"rewards/margins": 0.2767586410045624,
"rewards/rejected": -0.47061461210250854,
"step": 1110
},
{
"epoch": 0.09834698044036616,
"grad_norm": 1.8574669361114502,
"learning_rate": 9.72962962962963e-06,
"logits/chosen": 3.2028121948242188,
"logits/rejected": 3.188230037689209,
"logps/chosen": -0.6974012851715088,
"logps/rejected": -2.043318033218384,
"loss": 0.6628,
"nll_loss": 0.06341539323329926,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.20922040939331055,
"rewards/margins": 0.40377503633499146,
"rewards/rejected": -0.612995445728302,
"step": 1120
},
{
"epoch": 0.09922507848001229,
"grad_norm": 1.4223850965499878,
"learning_rate": 9.717283950617285e-06,
"logits/chosen": 3.1057040691375732,
"logits/rejected": 3.1665711402893066,
"logps/chosen": -0.4518910348415375,
"logps/rejected": -1.782041311264038,
"loss": 0.6529,
"nll_loss": 0.04685738682746887,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1355672925710678,
"rewards/margins": 0.39904507994651794,
"rewards/rejected": -0.5346124172210693,
"step": 1130
},
{
"epoch": 0.10010317651965842,
"grad_norm": 0.6512376666069031,
"learning_rate": 9.70493827160494e-06,
"logits/chosen": 3.062418222427368,
"logits/rejected": 3.1085588932037354,
"logps/chosen": -0.7759238481521606,
"logps/rejected": -2.237229585647583,
"loss": 0.6829,
"nll_loss": 0.08105801045894623,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.2327771931886673,
"rewards/margins": 0.43839168548583984,
"rewards/rejected": -0.6711689233779907,
"step": 1140
},
{
"epoch": 0.10098127455930454,
"grad_norm": 2.3270082473754883,
"learning_rate": 9.692592592592594e-06,
"logits/chosen": 2.8819382190704346,
"logits/rejected": 2.9401650428771973,
"logps/chosen": -0.5062090754508972,
"logps/rejected": -1.868971824645996,
"loss": 0.6794,
"nll_loss": 0.05651511624455452,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1518627107143402,
"rewards/margins": 0.40882882475852966,
"rewards/rejected": -0.5606915354728699,
"step": 1150
},
{
"epoch": 0.10185937259895067,
"grad_norm": 1.2098430395126343,
"learning_rate": 9.680246913580248e-06,
"logits/chosen": 2.7454497814178467,
"logits/rejected": 2.780897617340088,
"logps/chosen": -0.9837905168533325,
"logps/rejected": -2.0501391887664795,
"loss": 0.7538,
"nll_loss": 0.10150198638439178,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.2951371669769287,
"rewards/margins": 0.319904625415802,
"rewards/rejected": -0.6150418519973755,
"step": 1160
},
{
"epoch": 0.1027374706385968,
"grad_norm": 2.469829559326172,
"learning_rate": 9.667901234567903e-06,
"logits/chosen": 2.8060965538024902,
"logits/rejected": 2.8710038661956787,
"logps/chosen": -0.6813799738883972,
"logps/rejected": -2.1527724266052246,
"loss": 0.6474,
"nll_loss": 0.06949031352996826,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.2044139802455902,
"rewards/margins": 0.44141775369644165,
"rewards/rejected": -0.6458317637443542,
"step": 1170
},
{
"epoch": 0.10361556867824293,
"grad_norm": 3.5877394676208496,
"learning_rate": 9.655555555555556e-06,
"logits/chosen": 3.1048672199249268,
"logits/rejected": 3.1411147117614746,
"logps/chosen": -0.4382111132144928,
"logps/rejected": -2.12353253364563,
"loss": 0.624,
"nll_loss": 0.048050910234451294,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.13146333396434784,
"rewards/margins": 0.5055964589118958,
"rewards/rejected": -0.6370598077774048,
"step": 1180
},
{
"epoch": 0.10449366671788905,
"grad_norm": 13.380763053894043,
"learning_rate": 9.64320987654321e-06,
"logits/chosen": 2.6727805137634277,
"logits/rejected": 2.7236101627349854,
"logps/chosen": -0.7069253325462341,
"logps/rejected": -2.2335355281829834,
"loss": 0.6386,
"nll_loss": 0.05090578272938728,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.21207761764526367,
"rewards/margins": 0.45798301696777344,
"rewards/rejected": -0.6700606346130371,
"step": 1190
},
{
"epoch": 0.10537176475753518,
"grad_norm": 7.120882034301758,
"learning_rate": 9.630864197530864e-06,
"logits/chosen": 2.8762526512145996,
"logits/rejected": 2.901745319366455,
"logps/chosen": -0.7335812449455261,
"logps/rejected": -2.8476223945617676,
"loss": 0.6497,
"nll_loss": 0.07983629405498505,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2200743705034256,
"rewards/margins": 0.6342123746871948,
"rewards/rejected": -0.854286789894104,
"step": 1200
},
{
"epoch": 0.10624986279718131,
"grad_norm": 5.2290263175964355,
"learning_rate": 9.618518518518519e-06,
"logits/chosen": 3.0229249000549316,
"logits/rejected": 2.959900379180908,
"logps/chosen": -1.3016353845596313,
"logps/rejected": -1.7729085683822632,
"loss": 0.8886,
"nll_loss": 0.14201593399047852,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.3904905915260315,
"rewards/margins": 0.14138197898864746,
"rewards/rejected": -0.5318726301193237,
"step": 1210
},
{
"epoch": 0.10712796083682743,
"grad_norm": 1.350420355796814,
"learning_rate": 9.606172839506173e-06,
"logits/chosen": 2.695782423019409,
"logits/rejected": 2.6982076168060303,
"logps/chosen": -0.5528481602668762,
"logps/rejected": -1.7437477111816406,
"loss": 0.7039,
"nll_loss": 0.06110968068242073,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.16585442423820496,
"rewards/margins": 0.3572699725627899,
"rewards/rejected": -0.5231243371963501,
"step": 1220
},
{
"epoch": 0.10800605887647356,
"grad_norm": 4.9116291999816895,
"learning_rate": 9.593827160493828e-06,
"logits/chosen": 2.8747756481170654,
"logits/rejected": 2.797008514404297,
"logps/chosen": -0.7696909308433533,
"logps/rejected": -1.8576171398162842,
"loss": 0.7164,
"nll_loss": 0.08876083791255951,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.23090729117393494,
"rewards/margins": 0.3263779282569885,
"rewards/rejected": -0.5572851896286011,
"step": 1230
},
{
"epoch": 0.10888415691611969,
"grad_norm": 3.6960697174072266,
"learning_rate": 9.581481481481482e-06,
"logits/chosen": 3.121314287185669,
"logits/rejected": 3.1768624782562256,
"logps/chosen": -0.597256064414978,
"logps/rejected": -2.1361727714538574,
"loss": 0.6718,
"nll_loss": 0.06965653598308563,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.17917683720588684,
"rewards/margins": 0.4616750180721283,
"rewards/rejected": -0.6408518552780151,
"step": 1240
},
{
"epoch": 0.10976225495576582,
"grad_norm": 1.3148126602172852,
"learning_rate": 9.569135802469136e-06,
"logits/chosen": 2.9931716918945312,
"logits/rejected": 3.0196568965911865,
"logps/chosen": -0.8401254415512085,
"logps/rejected": -2.2920265197753906,
"loss": 0.7173,
"nll_loss": 0.08757736533880234,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.2520376443862915,
"rewards/margins": 0.4355703294277191,
"rewards/rejected": -0.6876079440116882,
"step": 1250
},
{
"epoch": 0.11064035299541194,
"grad_norm": 3.798100471496582,
"learning_rate": 9.556790123456791e-06,
"logits/chosen": 2.9449126720428467,
"logits/rejected": 2.9711549282073975,
"logps/chosen": -0.6954627633094788,
"logps/rejected": -1.5479028224945068,
"loss": 0.7229,
"nll_loss": 0.06693422794342041,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.20863886177539825,
"rewards/margins": 0.2557320296764374,
"rewards/rejected": -0.4643709063529968,
"step": 1260
},
{
"epoch": 0.11151845103505806,
"grad_norm": 2.8658065795898438,
"learning_rate": 9.544444444444445e-06,
"logits/chosen": 3.015810966491699,
"logits/rejected": 3.0757055282592773,
"logps/chosen": -1.2240221500396729,
"logps/rejected": -1.8749526739120483,
"loss": 0.8117,
"nll_loss": 0.10025894641876221,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.3672066628932953,
"rewards/margins": 0.1952790915966034,
"rewards/rejected": -0.5624858140945435,
"step": 1270
},
{
"epoch": 0.11239654907470419,
"grad_norm": 2.584207534790039,
"learning_rate": 9.5320987654321e-06,
"logits/chosen": 3.0405101776123047,
"logits/rejected": 3.0836472511291504,
"logps/chosen": -0.5894891619682312,
"logps/rejected": -1.7727954387664795,
"loss": 0.6706,
"nll_loss": 0.057862233370542526,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.17684674263000488,
"rewards/margins": 0.35499197244644165,
"rewards/rejected": -0.5318387150764465,
"step": 1280
},
{
"epoch": 0.11327464711435031,
"grad_norm": 0.8380700945854187,
"learning_rate": 9.519753086419754e-06,
"logits/chosen": 3.015899896621704,
"logits/rejected": 2.9938347339630127,
"logps/chosen": -0.48675793409347534,
"logps/rejected": -1.6840702295303345,
"loss": 0.6618,
"nll_loss": 0.06300728023052216,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.14602738618850708,
"rewards/margins": 0.35919371247291565,
"rewards/rejected": -0.5052211880683899,
"step": 1290
},
{
"epoch": 0.11415274515399644,
"grad_norm": 1.4230750799179077,
"learning_rate": 9.507407407407409e-06,
"logits/chosen": 3.2599899768829346,
"logits/rejected": 3.2479500770568848,
"logps/chosen": -0.5513351559638977,
"logps/rejected": -1.5504658222198486,
"loss": 0.7042,
"nll_loss": 0.05995137244462967,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.16540054976940155,
"rewards/margins": 0.299739271402359,
"rewards/rejected": -0.46513980627059937,
"step": 1300
},
{
"epoch": 0.11503084319364257,
"grad_norm": 1.0455182790756226,
"learning_rate": 9.495061728395063e-06,
"logits/chosen": 3.0312843322753906,
"logits/rejected": 3.068418025970459,
"logps/chosen": -0.5078593492507935,
"logps/rejected": -2.3526813983917236,
"loss": 0.6117,
"nll_loss": 0.04707217961549759,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.152357816696167,
"rewards/margins": 0.5534465909004211,
"rewards/rejected": -0.7058044672012329,
"step": 1310
},
{
"epoch": 0.1159089412332887,
"grad_norm": 3.6651833057403564,
"learning_rate": 9.482716049382716e-06,
"logits/chosen": 3.071345806121826,
"logits/rejected": 3.0848429203033447,
"logps/chosen": -0.6071802973747253,
"logps/rejected": -1.6902376413345337,
"loss": 0.6877,
"nll_loss": 0.06448554247617722,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.1821540892124176,
"rewards/margins": 0.324917197227478,
"rewards/rejected": -0.507071316242218,
"step": 1320
},
{
"epoch": 0.11678703927293482,
"grad_norm": 5.458474159240723,
"learning_rate": 9.47037037037037e-06,
"logits/chosen": 2.9955544471740723,
"logits/rejected": 2.9541006088256836,
"logps/chosen": -1.1582845449447632,
"logps/rejected": -2.687746047973633,
"loss": 0.8277,
"nll_loss": 0.14388300478458405,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.34748542308807373,
"rewards/margins": 0.45883846282958984,
"rewards/rejected": -0.8063238859176636,
"step": 1330
},
{
"epoch": 0.11766513731258095,
"grad_norm": 1.8496476411819458,
"learning_rate": 9.458024691358025e-06,
"logits/chosen": 2.8806967735290527,
"logits/rejected": 2.922480344772339,
"logps/chosen": -0.3559941351413727,
"logps/rejected": -2.068563938140869,
"loss": 0.5969,
"nll_loss": 0.03910910710692406,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.10679824650287628,
"rewards/margins": 0.5137708783149719,
"rewards/rejected": -0.6205691695213318,
"step": 1340
},
{
"epoch": 0.11854323535222708,
"grad_norm": 8.205083847045898,
"learning_rate": 9.44567901234568e-06,
"logits/chosen": 2.9301178455352783,
"logits/rejected": 2.972548246383667,
"logps/chosen": -0.6576313972473145,
"logps/rejected": -1.3083826303482056,
"loss": 0.7648,
"nll_loss": 0.0889785960316658,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.19728945195674896,
"rewards/margins": 0.19522538781166077,
"rewards/rejected": -0.39251479506492615,
"step": 1350
},
{
"epoch": 0.1194213333918732,
"grad_norm": 3.6549758911132812,
"learning_rate": 9.433333333333335e-06,
"logits/chosen": 3.0893311500549316,
"logits/rejected": 3.068948745727539,
"logps/chosen": -0.6783910989761353,
"logps/rejected": -2.4969258308410645,
"loss": 0.6481,
"nll_loss": 0.06912653148174286,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.203517347574234,
"rewards/margins": 0.545560359954834,
"rewards/rejected": -0.7490777969360352,
"step": 1360
},
{
"epoch": 0.12029943143151933,
"grad_norm": 1.5456334352493286,
"learning_rate": 9.42098765432099e-06,
"logits/chosen": 2.9339780807495117,
"logits/rejected": 2.9962122440338135,
"logps/chosen": -0.46310439705848694,
"logps/rejected": -2.436547040939331,
"loss": 0.6248,
"nll_loss": 0.06015176698565483,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.13893131911754608,
"rewards/margins": 0.5920329093933105,
"rewards/rejected": -0.730964183807373,
"step": 1370
},
{
"epoch": 0.12117752947116546,
"grad_norm": 2.95572829246521,
"learning_rate": 9.408641975308642e-06,
"logits/chosen": 2.7811484336853027,
"logits/rejected": 2.875540018081665,
"logps/chosen": -0.6040056347846985,
"logps/rejected": -2.7113006114959717,
"loss": 0.6342,
"nll_loss": 0.042366378009319305,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.18120168149471283,
"rewards/margins": 0.6321884393692017,
"rewards/rejected": -0.8133901357650757,
"step": 1380
},
{
"epoch": 0.12205562751081159,
"grad_norm": 5.596220970153809,
"learning_rate": 9.396296296296297e-06,
"logits/chosen": 2.9219090938568115,
"logits/rejected": 2.9703197479248047,
"logps/chosen": -1.1313722133636475,
"logps/rejected": -3.376211166381836,
"loss": 0.6689,
"nll_loss": 0.11231324821710587,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.3394116759300232,
"rewards/margins": 0.6734517216682434,
"rewards/rejected": -1.0128633975982666,
"step": 1390
},
{
"epoch": 0.12293372555045771,
"grad_norm": 4.154329776763916,
"learning_rate": 9.383950617283951e-06,
"logits/chosen": 2.875544548034668,
"logits/rejected": 2.909510374069214,
"logps/chosen": -0.7795349955558777,
"logps/rejected": -3.0624213218688965,
"loss": 0.6413,
"nll_loss": 0.09788934886455536,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.23386052250862122,
"rewards/margins": 0.6848658919334412,
"rewards/rejected": -0.9187263250350952,
"step": 1400
},
{
"epoch": 0.12381182359010383,
"grad_norm": 0.9190550446510315,
"learning_rate": 9.371604938271605e-06,
"logits/chosen": 2.800968885421753,
"logits/rejected": 2.7952873706817627,
"logps/chosen": -0.5842548608779907,
"logps/rejected": -2.0285487174987793,
"loss": 0.6672,
"nll_loss": 0.06469441950321198,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.17527645826339722,
"rewards/margins": 0.43328824639320374,
"rewards/rejected": -0.6085646748542786,
"step": 1410
},
{
"epoch": 0.12468992162974996,
"grad_norm": 0.4985824525356293,
"learning_rate": 9.35925925925926e-06,
"logits/chosen": 2.738670825958252,
"logits/rejected": 2.7031972408294678,
"logps/chosen": -0.714606761932373,
"logps/rejected": -1.1444988250732422,
"loss": 0.7813,
"nll_loss": 0.07031063735485077,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.21438205242156982,
"rewards/margins": 0.12896756827831268,
"rewards/rejected": -0.3433496356010437,
"step": 1420
},
{
"epoch": 0.1255680196693961,
"grad_norm": 3.7305266857147217,
"learning_rate": 9.346913580246914e-06,
"logits/chosen": 2.893129825592041,
"logits/rejected": 2.9149386882781982,
"logps/chosen": -0.46482163667678833,
"logps/rejected": -2.2275373935699463,
"loss": 0.6239,
"nll_loss": 0.051440030336380005,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.13944648206233978,
"rewards/margins": 0.5288147926330566,
"rewards/rejected": -0.6682612299919128,
"step": 1430
},
{
"epoch": 0.1264461177090422,
"grad_norm": 1.8036631345748901,
"learning_rate": 9.334567901234569e-06,
"logits/chosen": 2.547828197479248,
"logits/rejected": 2.5787394046783447,
"logps/chosen": -0.6464110612869263,
"logps/rejected": -2.2243905067443848,
"loss": 0.6899,
"nll_loss": 0.06981117278337479,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.19392332434654236,
"rewards/margins": 0.47339382767677307,
"rewards/rejected": -0.6673170924186707,
"step": 1440
},
{
"epoch": 0.12732421574868835,
"grad_norm": 1.5003271102905273,
"learning_rate": 9.322222222222223e-06,
"logits/chosen": 2.7942347526550293,
"logits/rejected": 2.790160655975342,
"logps/chosen": -0.45981842279434204,
"logps/rejected": -1.9567676782608032,
"loss": 0.6536,
"nll_loss": 0.0419035442173481,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.1379455327987671,
"rewards/margins": 0.4490847587585449,
"rewards/rejected": -0.587030291557312,
"step": 1450
},
{
"epoch": 0.12820231378833447,
"grad_norm": 3.1813344955444336,
"learning_rate": 9.309876543209878e-06,
"logits/chosen": 2.6398301124572754,
"logits/rejected": 2.6550662517547607,
"logps/chosen": -0.6792389154434204,
"logps/rejected": -2.9893813133239746,
"loss": 0.6317,
"nll_loss": 0.08011049032211304,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2037716805934906,
"rewards/margins": 0.6930428147315979,
"rewards/rejected": -0.8968144655227661,
"step": 1460
},
{
"epoch": 0.1290804118279806,
"grad_norm": 6.025292873382568,
"learning_rate": 9.297530864197532e-06,
"logits/chosen": 2.5487821102142334,
"logits/rejected": 2.6483190059661865,
"logps/chosen": -0.3606962561607361,
"logps/rejected": -2.5816264152526855,
"loss": 0.6086,
"nll_loss": 0.04057370498776436,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.10820887982845306,
"rewards/margins": 0.6662790179252625,
"rewards/rejected": -0.7744879126548767,
"step": 1470
},
{
"epoch": 0.12995850986762672,
"grad_norm": 31.56429100036621,
"learning_rate": 9.285185185185186e-06,
"logits/chosen": 2.4623022079467773,
"logits/rejected": 2.409700393676758,
"logps/chosen": -0.9133744239807129,
"logps/rejected": -3.7820403575897217,
"loss": 0.8332,
"nll_loss": 0.23980839550495148,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.27401235699653625,
"rewards/margins": 0.8605998158454895,
"rewards/rejected": -1.1346122026443481,
"step": 1480
},
{
"epoch": 0.13083660790727283,
"grad_norm": 0.035230621695518494,
"learning_rate": 9.27283950617284e-06,
"logits/chosen": 2.52907133102417,
"logits/rejected": 2.530890464782715,
"logps/chosen": -1.2194797992706299,
"logps/rejected": -2.8537631034851074,
"loss": 0.7781,
"nll_loss": 0.14254291355609894,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.36584392189979553,
"rewards/margins": 0.49028509855270386,
"rewards/rejected": -0.856128990650177,
"step": 1490
},
{
"epoch": 0.13171470594691898,
"grad_norm": 6.077812671661377,
"learning_rate": 9.260493827160495e-06,
"logits/chosen": 2.5622482299804688,
"logits/rejected": 2.607653856277466,
"logps/chosen": -0.8888137936592102,
"logps/rejected": -2.8897366523742676,
"loss": 0.6866,
"nll_loss": 0.07926015555858612,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2666441798210144,
"rewards/margins": 0.6002769470214844,
"rewards/rejected": -0.8669211268424988,
"step": 1500
},
{
"epoch": 0.1325928039865651,
"grad_norm": 3.651186466217041,
"learning_rate": 9.24814814814815e-06,
"logits/chosen": 2.693009853363037,
"logits/rejected": 2.6718087196350098,
"logps/chosen": -0.8477522134780884,
"logps/rejected": -1.7480132579803467,
"loss": 0.7543,
"nll_loss": 0.09234277904033661,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.25432562828063965,
"rewards/margins": 0.2700783610343933,
"rewards/rejected": -0.5244040489196777,
"step": 1510
},
{
"epoch": 0.13347090202621123,
"grad_norm": 2.8306994438171387,
"learning_rate": 9.235802469135802e-06,
"logits/chosen": 2.8524880409240723,
"logits/rejected": 2.8818418979644775,
"logps/chosen": -0.49177321791648865,
"logps/rejected": -1.9612071514129639,
"loss": 0.6686,
"nll_loss": 0.08659791201353073,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.14753195643424988,
"rewards/margins": 0.4408302307128906,
"rewards/rejected": -0.5883622169494629,
"step": 1520
},
{
"epoch": 0.13434900006585734,
"grad_norm": 3.763047456741333,
"learning_rate": 9.223456790123457e-06,
"logits/chosen": 2.698072671890259,
"logits/rejected": 2.754232883453369,
"logps/chosen": -0.798735499382019,
"logps/rejected": -2.2809195518493652,
"loss": 0.6913,
"nll_loss": 0.07482419162988663,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2396206557750702,
"rewards/margins": 0.44465526938438416,
"rewards/rejected": -0.6842759251594543,
"step": 1530
},
{
"epoch": 0.13522709810550348,
"grad_norm": 3.579263925552368,
"learning_rate": 9.211111111111111e-06,
"logits/chosen": 3.0055601596832275,
"logits/rejected": 3.015162706375122,
"logps/chosen": -0.6990305185317993,
"logps/rejected": -1.879294991493225,
"loss": 0.6897,
"nll_loss": 0.06048674136400223,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.20970916748046875,
"rewards/margins": 0.35407939553260803,
"rewards/rejected": -0.5637885332107544,
"step": 1540
},
{
"epoch": 0.1361051961451496,
"grad_norm": 1.4635862112045288,
"learning_rate": 9.198765432098766e-06,
"logits/chosen": 2.665912628173828,
"logits/rejected": 2.7371814250946045,
"logps/chosen": -0.644806981086731,
"logps/rejected": -2.0770316123962402,
"loss": 0.6949,
"nll_loss": 0.07454844564199448,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.19344215095043182,
"rewards/margins": 0.4296673834323883,
"rewards/rejected": -0.6231095194816589,
"step": 1550
},
{
"epoch": 0.13698329418479574,
"grad_norm": 3.593688726425171,
"learning_rate": 9.18641975308642e-06,
"logits/chosen": 3.0721404552459717,
"logits/rejected": 3.028421401977539,
"logps/chosen": -0.8886274099349976,
"logps/rejected": -2.1320443153381348,
"loss": 0.7047,
"nll_loss": 0.07659469544887543,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.2665882408618927,
"rewards/margins": 0.3730250298976898,
"rewards/rejected": -0.6396132707595825,
"step": 1560
},
{
"epoch": 0.13786139222444185,
"grad_norm": 6.7611470222473145,
"learning_rate": 9.174074074074074e-06,
"logits/chosen": 2.952538013458252,
"logits/rejected": 2.9351909160614014,
"logps/chosen": -0.5501828193664551,
"logps/rejected": -2.4539520740509033,
"loss": 0.6445,
"nll_loss": 0.07492227852344513,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.16505484282970428,
"rewards/margins": 0.5711307525634766,
"rewards/rejected": -0.7361854910850525,
"step": 1570
},
{
"epoch": 0.138739490264088,
"grad_norm": 0.00917895883321762,
"learning_rate": 9.161728395061729e-06,
"logits/chosen": 2.942192792892456,
"logits/rejected": 2.9979898929595947,
"logps/chosen": -0.4612821638584137,
"logps/rejected": -1.4370297193527222,
"loss": 0.6585,
"nll_loss": 0.04147753119468689,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.13838467001914978,
"rewards/margins": 0.2927243113517761,
"rewards/rejected": -0.43110889196395874,
"step": 1580
},
{
"epoch": 0.1396175883037341,
"grad_norm": 1.7900971174240112,
"learning_rate": 9.149382716049383e-06,
"logits/chosen": 2.7242748737335205,
"logits/rejected": 2.7570395469665527,
"logps/chosen": -0.5272113084793091,
"logps/rejected": -1.8239033222198486,
"loss": 0.6752,
"nll_loss": 0.07194850593805313,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1581634134054184,
"rewards/margins": 0.3890075981616974,
"rewards/rejected": -0.5471709966659546,
"step": 1590
},
{
"epoch": 0.14049568634338025,
"grad_norm": 7.255580902099609,
"learning_rate": 9.137037037037038e-06,
"logits/chosen": 2.852332592010498,
"logits/rejected": 2.8654465675354004,
"logps/chosen": -0.8652445077896118,
"logps/rejected": -2.7340779304504395,
"loss": 0.7026,
"nll_loss": 0.08957532793283463,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.25957340002059937,
"rewards/margins": 0.5606500506401062,
"rewards/rejected": -0.8202234506607056,
"step": 1600
},
{
"epoch": 0.14137378438302636,
"grad_norm": 0.27785807847976685,
"learning_rate": 9.124691358024692e-06,
"logits/chosen": 2.8517799377441406,
"logits/rejected": 2.871009111404419,
"logps/chosen": -0.45925140380859375,
"logps/rejected": -1.38749098777771,
"loss": 0.6784,
"nll_loss": 0.046599697321653366,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.13777543604373932,
"rewards/margins": 0.2784718871116638,
"rewards/rejected": -0.41624727845191956,
"step": 1610
},
{
"epoch": 0.1422518824226725,
"grad_norm": 2.3490424156188965,
"learning_rate": 9.112345679012347e-06,
"logits/chosen": 3.001574754714966,
"logits/rejected": 2.9703142642974854,
"logps/chosen": -0.4932584762573242,
"logps/rejected": -2.1252918243408203,
"loss": 0.6813,
"nll_loss": 0.08429791033267975,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.1479775607585907,
"rewards/margins": 0.48961010575294495,
"rewards/rejected": -0.6375876665115356,
"step": 1620
},
{
"epoch": 0.14312998046231862,
"grad_norm": 3.299020767211914,
"learning_rate": 9.100000000000001e-06,
"logits/chosen": 2.8275907039642334,
"logits/rejected": 2.9383082389831543,
"logps/chosen": -0.6793197989463806,
"logps/rejected": -2.526850700378418,
"loss": 0.6513,
"nll_loss": 0.05579754710197449,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.20379595458507538,
"rewards/margins": 0.5542593002319336,
"rewards/rejected": -0.7580552101135254,
"step": 1630
},
{
"epoch": 0.14400807850196473,
"grad_norm": 0.914357602596283,
"learning_rate": 9.087654320987655e-06,
"logits/chosen": 2.987334966659546,
"logits/rejected": 2.9897654056549072,
"logps/chosen": -0.665000319480896,
"logps/rejected": -2.4969944953918457,
"loss": 0.6926,
"nll_loss": 0.07977604120969772,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.19950011372566223,
"rewards/margins": 0.5495983362197876,
"rewards/rejected": -0.7490984797477722,
"step": 1640
},
{
"epoch": 0.14488617654161087,
"grad_norm": 2.353787422180176,
"learning_rate": 9.075308641975308e-06,
"logits/chosen": 2.7523000240325928,
"logits/rejected": 2.805290937423706,
"logps/chosen": -0.34468549489974976,
"logps/rejected": -2.1994361877441406,
"loss": 0.6206,
"nll_loss": 0.04046661779284477,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.1034056693315506,
"rewards/margins": 0.5564252138137817,
"rewards/rejected": -0.6598309278488159,
"step": 1650
},
{
"epoch": 0.145764274581257,
"grad_norm": 9.683286666870117,
"learning_rate": 9.062962962962964e-06,
"logits/chosen": 2.8048713207244873,
"logits/rejected": 2.8911221027374268,
"logps/chosen": -0.8389989137649536,
"logps/rejected": -1.6681379079818726,
"loss": 0.7894,
"nll_loss": 0.09216944873332977,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.25169968605041504,
"rewards/margins": 0.24874171614646912,
"rewards/rejected": -0.5004413723945618,
"step": 1660
},
{
"epoch": 0.14664237262090313,
"grad_norm": 9.369816780090332,
"learning_rate": 9.050617283950619e-06,
"logits/chosen": 2.8822600841522217,
"logits/rejected": 2.904628038406372,
"logps/chosen": -0.8616389036178589,
"logps/rejected": -2.4774065017700195,
"loss": 0.6733,
"nll_loss": 0.052507419139146805,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2584916949272156,
"rewards/margins": 0.48473024368286133,
"rewards/rejected": -0.7432219386100769,
"step": 1670
},
{
"epoch": 0.14752047066054924,
"grad_norm": 1.7723337411880493,
"learning_rate": 9.038271604938273e-06,
"logits/chosen": 3.0452260971069336,
"logits/rejected": 3.092968225479126,
"logps/chosen": -0.5382004380226135,
"logps/rejected": -1.853811264038086,
"loss": 0.6863,
"nll_loss": 0.07374037802219391,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.16146014630794525,
"rewards/margins": 0.39468324184417725,
"rewards/rejected": -0.5561434030532837,
"step": 1680
},
{
"epoch": 0.14839856870019538,
"grad_norm": 4.466056823730469,
"learning_rate": 9.025925925925927e-06,
"logits/chosen": 2.8592844009399414,
"logits/rejected": 2.896420955657959,
"logps/chosen": -0.5004099011421204,
"logps/rejected": -1.6472933292388916,
"loss": 0.6771,
"nll_loss": 0.05312333256006241,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.1501229852437973,
"rewards/margins": 0.34406501054763794,
"rewards/rejected": -0.49418801069259644,
"step": 1690
},
{
"epoch": 0.1492766667398415,
"grad_norm": 2.437678098678589,
"learning_rate": 9.013580246913582e-06,
"logits/chosen": 2.8448596000671387,
"logits/rejected": 2.8915913105010986,
"logps/chosen": -0.6149319410324097,
"logps/rejected": -1.89218008518219,
"loss": 0.673,
"nll_loss": 0.05624104663729668,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.18447960913181305,
"rewards/margins": 0.3831743597984314,
"rewards/rejected": -0.567654013633728,
"step": 1700
},
{
"epoch": 0.15015476477948764,
"grad_norm": 3.493530750274658,
"learning_rate": 9.001234567901236e-06,
"logits/chosen": 2.781919479370117,
"logits/rejected": 2.7699952125549316,
"logps/chosen": -0.6766859889030457,
"logps/rejected": -2.136669635772705,
"loss": 0.6503,
"nll_loss": 0.06224127486348152,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2030058205127716,
"rewards/margins": 0.43799519538879395,
"rewards/rejected": -0.6410009264945984,
"step": 1710
},
{
"epoch": 0.15103286281913375,
"grad_norm": 3.9590651988983154,
"learning_rate": 8.988888888888889e-06,
"logits/chosen": 2.562682628631592,
"logits/rejected": 2.5419199466705322,
"logps/chosen": -0.965559184551239,
"logps/rejected": -2.73313570022583,
"loss": 0.7286,
"nll_loss": 0.10021784156560898,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.2896677553653717,
"rewards/margins": 0.5302730798721313,
"rewards/rejected": -0.8199408650398254,
"step": 1720
},
{
"epoch": 0.1519109608587799,
"grad_norm": 6.649216175079346,
"learning_rate": 8.976543209876543e-06,
"logits/chosen": 2.754970073699951,
"logits/rejected": 2.800556182861328,
"logps/chosen": -0.7695094347000122,
"logps/rejected": -2.4826478958129883,
"loss": 0.7055,
"nll_loss": 0.05943988636136055,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.23085281252861023,
"rewards/margins": 0.5139415264129639,
"rewards/rejected": -0.7447944283485413,
"step": 1730
},
{
"epoch": 0.152789058898426,
"grad_norm": 3.0816867351531982,
"learning_rate": 8.964197530864198e-06,
"logits/chosen": 2.7005701065063477,
"logits/rejected": 2.8037772178649902,
"logps/chosen": -0.7151850461959839,
"logps/rejected": -2.9252617359161377,
"loss": 0.6407,
"nll_loss": 0.06549613177776337,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2145555019378662,
"rewards/margins": 0.6630231142044067,
"rewards/rejected": -0.8775785565376282,
"step": 1740
},
{
"epoch": 0.15366715693807215,
"grad_norm": 3.4558446407318115,
"learning_rate": 8.951851851851852e-06,
"logits/chosen": 2.783221483230591,
"logits/rejected": 2.7478537559509277,
"logps/chosen": -0.3045424818992615,
"logps/rejected": -1.9958875179290771,
"loss": 0.5977,
"nll_loss": 0.030114714056253433,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.09136275202035904,
"rewards/margins": 0.507403552532196,
"rewards/rejected": -0.5987662672996521,
"step": 1750
},
{
"epoch": 0.15454525497771826,
"grad_norm": 2.812927722930908,
"learning_rate": 8.939506172839507e-06,
"logits/chosen": 2.567533016204834,
"logits/rejected": 2.5942509174346924,
"logps/chosen": -0.7922319173812866,
"logps/rejected": -1.9200432300567627,
"loss": 0.7427,
"nll_loss": 0.07532784342765808,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.23766958713531494,
"rewards/margins": 0.33834341168403625,
"rewards/rejected": -0.5760129690170288,
"step": 1760
},
{
"epoch": 0.1554233530173644,
"grad_norm": 4.2586750984191895,
"learning_rate": 8.927160493827161e-06,
"logits/chosen": 2.8497743606567383,
"logits/rejected": 2.857257604598999,
"logps/chosen": -1.0099961757659912,
"logps/rejected": -3.4478023052215576,
"loss": 0.6624,
"nll_loss": 0.1071515902876854,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.302998811006546,
"rewards/margins": 0.7313419580459595,
"rewards/rejected": -1.034340739250183,
"step": 1770
},
{
"epoch": 0.15630145105701052,
"grad_norm": 1.7801121473312378,
"learning_rate": 8.914814814814816e-06,
"logits/chosen": 2.7116096019744873,
"logits/rejected": 2.7383835315704346,
"logps/chosen": -0.6959985494613647,
"logps/rejected": -2.842719554901123,
"loss": 0.626,
"nll_loss": 0.07402163743972778,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2087995707988739,
"rewards/margins": 0.6440162658691406,
"rewards/rejected": -0.8528158068656921,
"step": 1780
},
{
"epoch": 0.15717954909665663,
"grad_norm": 2.949233055114746,
"learning_rate": 8.90246913580247e-06,
"logits/chosen": 2.5691332817077637,
"logits/rejected": 2.6201071739196777,
"logps/chosen": -0.7052744626998901,
"logps/rejected": -2.180938720703125,
"loss": 0.6607,
"nll_loss": 0.058830149471759796,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.21158234775066376,
"rewards/margins": 0.4426993429660797,
"rewards/rejected": -0.6542816758155823,
"step": 1790
},
{
"epoch": 0.15805764713630277,
"grad_norm": 0.5553939938545227,
"learning_rate": 8.890123456790124e-06,
"logits/chosen": 2.677717685699463,
"logits/rejected": 2.782273292541504,
"logps/chosen": -0.4674352705478668,
"logps/rejected": -3.086191177368164,
"loss": 0.5701,
"nll_loss": 0.04577519744634628,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.14023058116436005,
"rewards/margins": 0.7856268286705017,
"rewards/rejected": -0.925857424736023,
"step": 1800
},
{
"epoch": 0.15893574517594888,
"grad_norm": 2.8835935592651367,
"learning_rate": 8.877777777777779e-06,
"logits/chosen": 2.745272636413574,
"logits/rejected": 2.806513547897339,
"logps/chosen": -0.5373865962028503,
"logps/rejected": -3.408268451690674,
"loss": 0.6131,
"nll_loss": 0.053614210337400436,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.16121599078178406,
"rewards/margins": 0.8612645864486694,
"rewards/rejected": -1.0224807262420654,
"step": 1810
},
{
"epoch": 0.15981384321559503,
"grad_norm": 6.393036365509033,
"learning_rate": 8.865432098765433e-06,
"logits/chosen": 2.6081528663635254,
"logits/rejected": 2.663269281387329,
"logps/chosen": -0.6596813201904297,
"logps/rejected": -3.0106234550476074,
"loss": 0.6523,
"nll_loss": 0.07575313746929169,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.19790442287921906,
"rewards/margins": 0.7052826881408691,
"rewards/rejected": -0.903187096118927,
"step": 1820
},
{
"epoch": 0.16069194125524114,
"grad_norm": 8.32457160949707,
"learning_rate": 8.853086419753088e-06,
"logits/chosen": 2.1919102668762207,
"logits/rejected": 2.2420222759246826,
"logps/chosen": -0.6085657477378845,
"logps/rejected": -3.387340545654297,
"loss": 0.6396,
"nll_loss": 0.05589609593153,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1825697273015976,
"rewards/margins": 0.8336323499679565,
"rewards/rejected": -1.0162022113800049,
"step": 1830
},
{
"epoch": 0.16157003929488728,
"grad_norm": 8.0199556350708,
"learning_rate": 8.840740740740742e-06,
"logits/chosen": 2.2459845542907715,
"logits/rejected": 2.2368826866149902,
"logps/chosen": -0.904313862323761,
"logps/rejected": -3.4034416675567627,
"loss": 0.7623,
"nll_loss": 0.13848955929279327,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.27129411697387695,
"rewards/margins": 0.7497383952140808,
"rewards/rejected": -1.0210325717926025,
"step": 1840
},
{
"epoch": 0.1624481373345334,
"grad_norm": 3.155104398727417,
"learning_rate": 8.828395061728395e-06,
"logits/chosen": 2.594691514968872,
"logits/rejected": 2.571620464324951,
"logps/chosen": -1.037233591079712,
"logps/rejected": -2.723869800567627,
"loss": 0.7723,
"nll_loss": 0.12755393981933594,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3111700415611267,
"rewards/margins": 0.5059908628463745,
"rewards/rejected": -0.817160964012146,
"step": 1850
},
{
"epoch": 0.16332623537417953,
"grad_norm": 3.548657178878784,
"learning_rate": 8.81604938271605e-06,
"logits/chosen": 2.7853002548217773,
"logits/rejected": 2.796757221221924,
"logps/chosen": -0.5054196119308472,
"logps/rejected": -1.9281543493270874,
"loss": 0.6821,
"nll_loss": 0.06374648213386536,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.15162590146064758,
"rewards/margins": 0.42682045698165894,
"rewards/rejected": -0.5784463286399841,
"step": 1860
},
{
"epoch": 0.16420433341382565,
"grad_norm": 5.429587364196777,
"learning_rate": 8.803703703703704e-06,
"logits/chosen": 2.6975064277648926,
"logits/rejected": 2.7702746391296387,
"logps/chosen": -0.7951546907424927,
"logps/rejected": -2.666987895965576,
"loss": 0.6656,
"nll_loss": 0.06336641311645508,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.2385464459657669,
"rewards/margins": 0.5615500211715698,
"rewards/rejected": -0.8000965118408203,
"step": 1870
},
{
"epoch": 0.1650824314534718,
"grad_norm": 1.150895357131958,
"learning_rate": 8.791358024691358e-06,
"logits/chosen": 2.687932252883911,
"logits/rejected": 2.754683017730713,
"logps/chosen": -0.7847083806991577,
"logps/rejected": -2.204124927520752,
"loss": 0.7186,
"nll_loss": 0.08406446129083633,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.23541252315044403,
"rewards/margins": 0.42582497000694275,
"rewards/rejected": -0.6612375378608704,
"step": 1880
},
{
"epoch": 0.1659605294931179,
"grad_norm": 2.4268455505371094,
"learning_rate": 8.779012345679012e-06,
"logits/chosen": 2.5794925689697266,
"logits/rejected": 2.6048686504364014,
"logps/chosen": -0.6307061314582825,
"logps/rejected": -2.8818671703338623,
"loss": 0.619,
"nll_loss": 0.07149704545736313,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.1892118602991104,
"rewards/margins": 0.6753484010696411,
"rewards/rejected": -0.8645601272583008,
"step": 1890
},
{
"epoch": 0.16683862753276404,
"grad_norm": 5.715259075164795,
"learning_rate": 8.766666666666669e-06,
"logits/chosen": 2.707252025604248,
"logits/rejected": 2.6988537311553955,
"logps/chosen": -0.6423169374465942,
"logps/rejected": -1.875507116317749,
"loss": 0.6782,
"nll_loss": 0.04863595962524414,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.19269508123397827,
"rewards/margins": 0.3699570894241333,
"rewards/rejected": -0.5626521706581116,
"step": 1900
},
{
"epoch": 0.16771672557241016,
"grad_norm": 1.6416656970977783,
"learning_rate": 8.754320987654323e-06,
"logits/chosen": 2.792642593383789,
"logits/rejected": 2.8566908836364746,
"logps/chosen": -0.6888980269432068,
"logps/rejected": -2.5319087505340576,
"loss": 0.653,
"nll_loss": 0.06277020275592804,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.206669420003891,
"rewards/margins": 0.5529031753540039,
"rewards/rejected": -0.7595726251602173,
"step": 1910
},
{
"epoch": 0.1685948236120563,
"grad_norm": 1.5920031070709229,
"learning_rate": 8.741975308641976e-06,
"logits/chosen": 2.6872434616088867,
"logits/rejected": 2.6933059692382812,
"logps/chosen": -0.5184003710746765,
"logps/rejected": -1.9327194690704346,
"loss": 0.6515,
"nll_loss": 0.06715475022792816,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.15552011132240295,
"rewards/margins": 0.42429572343826294,
"rewards/rejected": -0.5798158049583435,
"step": 1920
},
{
"epoch": 0.1694729216517024,
"grad_norm": 4.420103073120117,
"learning_rate": 8.72962962962963e-06,
"logits/chosen": 2.835470676422119,
"logits/rejected": 2.8618292808532715,
"logps/chosen": -0.7457289695739746,
"logps/rejected": -2.064795970916748,
"loss": 0.7263,
"nll_loss": 0.10260520875453949,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.22371868789196014,
"rewards/margins": 0.39572006464004517,
"rewards/rejected": -0.6194387674331665,
"step": 1930
},
{
"epoch": 0.17035101969134853,
"grad_norm": 2.6691431999206543,
"learning_rate": 8.717283950617285e-06,
"logits/chosen": 2.6595866680145264,
"logits/rejected": 2.683384656906128,
"logps/chosen": -0.5285651087760925,
"logps/rejected": -1.9386451244354248,
"loss": 0.6966,
"nll_loss": 0.07593102753162384,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.1585695445537567,
"rewards/margins": 0.4230240285396576,
"rewards/rejected": -0.5815936326980591,
"step": 1940
},
{
"epoch": 0.17122911773099467,
"grad_norm": 1.895266056060791,
"learning_rate": 8.704938271604939e-06,
"logits/chosen": 2.801657199859619,
"logits/rejected": 2.762845516204834,
"logps/chosen": -0.6032005548477173,
"logps/rejected": -1.6262376308441162,
"loss": 0.6888,
"nll_loss": 0.060188956558704376,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.18096016347408295,
"rewards/margins": 0.30691108107566833,
"rewards/rejected": -0.4878712594509125,
"step": 1950
},
{
"epoch": 0.17210721577064078,
"grad_norm": 1.7787566184997559,
"learning_rate": 8.692592592592593e-06,
"logits/chosen": 2.8991875648498535,
"logits/rejected": 2.936009645462036,
"logps/chosen": -0.5252435803413391,
"logps/rejected": -1.9678659439086914,
"loss": 0.6254,
"nll_loss": 0.047948211431503296,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.15757307410240173,
"rewards/margins": 0.432786762714386,
"rewards/rejected": -0.5903598070144653,
"step": 1960
},
{
"epoch": 0.17298531381028692,
"grad_norm": 1.326920747756958,
"learning_rate": 8.680246913580248e-06,
"logits/chosen": 2.5659279823303223,
"logits/rejected": 2.590271472930908,
"logps/chosen": -0.5297742486000061,
"logps/rejected": -2.028113842010498,
"loss": 0.6545,
"nll_loss": 0.05374212935566902,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.15893225371837616,
"rewards/margins": 0.4495018422603607,
"rewards/rejected": -0.6084341406822205,
"step": 1970
},
{
"epoch": 0.17386341184993304,
"grad_norm": 4.611727714538574,
"learning_rate": 8.667901234567902e-06,
"logits/chosen": 2.6264684200286865,
"logits/rejected": 2.632913589477539,
"logps/chosen": -0.693698525428772,
"logps/rejected": -3.1472320556640625,
"loss": 0.6069,
"nll_loss": 0.05521649122238159,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.20810957252979279,
"rewards/margins": 0.7360601425170898,
"rewards/rejected": -0.9441697001457214,
"step": 1980
},
{
"epoch": 0.17474150988957918,
"grad_norm": 2.2870845794677734,
"learning_rate": 8.655555555555557e-06,
"logits/chosen": 2.3779568672180176,
"logits/rejected": 2.4282214641571045,
"logps/chosen": -0.24996769428253174,
"logps/rejected": -2.317931890487671,
"loss": 0.5852,
"nll_loss": 0.045966412872076035,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.07499031722545624,
"rewards/margins": 0.6203892230987549,
"rewards/rejected": -0.6953796148300171,
"step": 1990
},
{
"epoch": 0.1756196079292253,
"grad_norm": 1.1437376737594604,
"learning_rate": 8.643209876543211e-06,
"logits/chosen": 2.380004644393921,
"logits/rejected": 2.3685264587402344,
"logps/chosen": -0.5816354751586914,
"logps/rejected": -2.3618433475494385,
"loss": 0.6945,
"nll_loss": 0.07123078405857086,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.17449061572551727,
"rewards/margins": 0.5340624451637268,
"rewards/rejected": -0.70855313539505,
"step": 2000
},
{
"epoch": 0.17649770596887143,
"grad_norm": 9.37255573272705,
"learning_rate": 8.630864197530865e-06,
"logits/chosen": 2.211916446685791,
"logits/rejected": 2.220418930053711,
"logps/chosen": -0.5349146723747253,
"logps/rejected": -2.8715481758117676,
"loss": 0.6366,
"nll_loss": 0.06667280942201614,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.16047440469264984,
"rewards/margins": 0.7009900808334351,
"rewards/rejected": -0.8614645004272461,
"step": 2010
},
{
"epoch": 0.17737580400851755,
"grad_norm": 2.2170488834381104,
"learning_rate": 8.61851851851852e-06,
"logits/chosen": 1.94232177734375,
"logits/rejected": 1.9853718280792236,
"logps/chosen": -0.2896527945995331,
"logps/rejected": -3.063877820968628,
"loss": 0.6189,
"nll_loss": 0.046251922845840454,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.08689583837985992,
"rewards/margins": 0.8322674632072449,
"rewards/rejected": -0.9191633462905884,
"step": 2020
},
{
"epoch": 0.1782539020481637,
"grad_norm": 8.343827247619629,
"learning_rate": 8.606172839506174e-06,
"logits/chosen": 1.9204469919204712,
"logits/rejected": 1.948312759399414,
"logps/chosen": -1.1184568405151367,
"logps/rejected": -3.0377535820007324,
"loss": 0.8225,
"nll_loss": 0.12297489494085312,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3355370759963989,
"rewards/margins": 0.5757889747619629,
"rewards/rejected": -0.9113261103630066,
"step": 2030
},
{
"epoch": 0.1791320000878098,
"grad_norm": 13.199078559875488,
"learning_rate": 8.593827160493829e-06,
"logits/chosen": 2.0205349922180176,
"logits/rejected": 2.0275635719299316,
"logps/chosen": -1.428043007850647,
"logps/rejected": -2.872222661972046,
"loss": 0.9056,
"nll_loss": 0.1807326227426529,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.4284129738807678,
"rewards/margins": 0.4332540035247803,
"rewards/rejected": -0.8616668581962585,
"step": 2040
},
{
"epoch": 0.18001009812745594,
"grad_norm": 2.260270357131958,
"learning_rate": 8.581481481481481e-06,
"logits/chosen": 2.2045745849609375,
"logits/rejected": 2.187514543533325,
"logps/chosen": -0.4845626950263977,
"logps/rejected": -2.2464981079101562,
"loss": 0.6956,
"nll_loss": 0.06010964512825012,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1453687846660614,
"rewards/margins": 0.5285806059837341,
"rewards/rejected": -0.6739493608474731,
"step": 2050
},
{
"epoch": 0.18088819616710206,
"grad_norm": 2.032146453857422,
"learning_rate": 8.569135802469136e-06,
"logits/chosen": 2.2264397144317627,
"logits/rejected": 2.265310764312744,
"logps/chosen": -1.0286977291107178,
"logps/rejected": -3.1123714447021484,
"loss": 0.7175,
"nll_loss": 0.11090108007192612,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3086093068122864,
"rewards/margins": 0.6251022815704346,
"rewards/rejected": -0.9337115287780762,
"step": 2060
},
{
"epoch": 0.1817662942067482,
"grad_norm": 0.9217659831047058,
"learning_rate": 8.55679012345679e-06,
"logits/chosen": 2.2858176231384277,
"logits/rejected": 2.315831422805786,
"logps/chosen": -0.6487756967544556,
"logps/rejected": -2.1730990409851074,
"loss": 0.6862,
"nll_loss": 0.07243213802576065,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.19463272392749786,
"rewards/margins": 0.4572969377040863,
"rewards/rejected": -0.6519297361373901,
"step": 2070
},
{
"epoch": 0.1826443922463943,
"grad_norm": 4.933953285217285,
"learning_rate": 8.544444444444445e-06,
"logits/chosen": 2.4036874771118164,
"logits/rejected": 2.3770546913146973,
"logps/chosen": -0.9166983366012573,
"logps/rejected": -2.5144124031066895,
"loss": 0.7355,
"nll_loss": 0.12400822341442108,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.27500951290130615,
"rewards/margins": 0.47931423783302307,
"rewards/rejected": -0.7543236613273621,
"step": 2080
},
{
"epoch": 0.18352249028604042,
"grad_norm": 5.920201778411865,
"learning_rate": 8.532098765432099e-06,
"logits/chosen": 2.504974603652954,
"logits/rejected": 2.541961193084717,
"logps/chosen": -0.8963934779167175,
"logps/rejected": -2.740062713623047,
"loss": 0.7162,
"nll_loss": 0.1004381999373436,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.2689180374145508,
"rewards/margins": 0.5531007051467896,
"rewards/rejected": -0.8220188021659851,
"step": 2090
},
{
"epoch": 0.18440058832568657,
"grad_norm": 0.10835571587085724,
"learning_rate": 8.519753086419754e-06,
"logits/chosen": 2.54685640335083,
"logits/rejected": 2.627911329269409,
"logps/chosen": -0.33035722374916077,
"logps/rejected": -2.806840419769287,
"loss": 0.561,
"nll_loss": 0.03572739288210869,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.09910716861486435,
"rewards/margins": 0.7429450154304504,
"rewards/rejected": -0.8420522809028625,
"step": 2100
},
{
"epoch": 0.18527868636533268,
"grad_norm": 5.237948417663574,
"learning_rate": 8.507407407407408e-06,
"logits/chosen": 2.4657981395721436,
"logits/rejected": 2.541999101638794,
"logps/chosen": -0.5284551382064819,
"logps/rejected": -2.2460741996765137,
"loss": 0.6753,
"nll_loss": 0.055809833109378815,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.15853655338287354,
"rewards/margins": 0.515285849571228,
"rewards/rejected": -0.6738223433494568,
"step": 2110
},
{
"epoch": 0.18615678440497882,
"grad_norm": 6.183863162994385,
"learning_rate": 8.495061728395062e-06,
"logits/chosen": 2.370246171951294,
"logits/rejected": 2.388896942138672,
"logps/chosen": -0.5437101125717163,
"logps/rejected": -2.8997700214385986,
"loss": 0.6217,
"nll_loss": 0.07455585151910782,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.1631130427122116,
"rewards/margins": 0.7068179249763489,
"rewards/rejected": -0.8699310421943665,
"step": 2120
},
{
"epoch": 0.18703488244462493,
"grad_norm": 3.2471072673797607,
"learning_rate": 8.482716049382717e-06,
"logits/chosen": 2.301064968109131,
"logits/rejected": 2.3306994438171387,
"logps/chosen": -1.0358989238739014,
"logps/rejected": -3.3612143993377686,
"loss": 0.6543,
"nll_loss": 0.08921568840742111,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.3107697069644928,
"rewards/margins": 0.6975947618484497,
"rewards/rejected": -1.0083644390106201,
"step": 2130
},
{
"epoch": 0.18791298048427107,
"grad_norm": 2.2561800479888916,
"learning_rate": 8.470370370370371e-06,
"logits/chosen": 2.35896635055542,
"logits/rejected": 2.424318790435791,
"logps/chosen": -0.42497625946998596,
"logps/rejected": -3.6937179565429688,
"loss": 0.5684,
"nll_loss": 0.049736388027668,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.12749287486076355,
"rewards/margins": 0.9806225895881653,
"rewards/rejected": -1.1081154346466064,
"step": 2140
},
{
"epoch": 0.1887910785239172,
"grad_norm": 3.721003293991089,
"learning_rate": 8.458024691358026e-06,
"logits/chosen": 2.294174909591675,
"logits/rejected": 2.332521915435791,
"logps/chosen": -0.3734773099422455,
"logps/rejected": -2.520681858062744,
"loss": 0.6118,
"nll_loss": 0.03998088836669922,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.11204320192337036,
"rewards/margins": 0.6441613435745239,
"rewards/rejected": -0.7562046051025391,
"step": 2150
},
{
"epoch": 0.18966917656356333,
"grad_norm": 0.09796835482120514,
"learning_rate": 8.44567901234568e-06,
"logits/chosen": 2.144991397857666,
"logits/rejected": 2.220353364944458,
"logps/chosen": -0.5873435139656067,
"logps/rejected": -3.0226333141326904,
"loss": 0.6662,
"nll_loss": 0.05847520753741264,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.17620307207107544,
"rewards/margins": 0.7305869460105896,
"rewards/rejected": -0.9067900776863098,
"step": 2160
},
{
"epoch": 0.19054727460320944,
"grad_norm": 5.563838005065918,
"learning_rate": 8.433333333333334e-06,
"logits/chosen": 2.4212334156036377,
"logits/rejected": 2.4930455684661865,
"logps/chosen": -1.0709176063537598,
"logps/rejected": -3.6808440685272217,
"loss": 0.705,
"nll_loss": 0.1328948587179184,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3212752938270569,
"rewards/margins": 0.7829779386520386,
"rewards/rejected": -1.1042531728744507,
"step": 2170
},
{
"epoch": 0.19142537264285558,
"grad_norm": 2.7498562335968018,
"learning_rate": 8.420987654320987e-06,
"logits/chosen": 2.3793070316314697,
"logits/rejected": 2.4112510681152344,
"logps/chosen": -0.8520647883415222,
"logps/rejected": -3.673933506011963,
"loss": 0.6735,
"nll_loss": 0.11274605989456177,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.25561946630477905,
"rewards/margins": 0.8465606570243835,
"rewards/rejected": -1.102180004119873,
"step": 2180
},
{
"epoch": 0.1923034706825017,
"grad_norm": 2.2553093433380127,
"learning_rate": 8.408641975308642e-06,
"logits/chosen": 2.5548908710479736,
"logits/rejected": 2.6078009605407715,
"logps/chosen": -0.457774817943573,
"logps/rejected": -3.5358822345733643,
"loss": 0.6123,
"nll_loss": 0.05915825441479683,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.13733243942260742,
"rewards/margins": 0.9234321713447571,
"rewards/rejected": -1.0607647895812988,
"step": 2190
},
{
"epoch": 0.19318156872214784,
"grad_norm": 2.100526809692383,
"learning_rate": 8.396296296296296e-06,
"logits/chosen": 2.438559055328369,
"logits/rejected": 2.4799530506134033,
"logps/chosen": -0.5425572395324707,
"logps/rejected": -2.516019582748413,
"loss": 0.6859,
"nll_loss": 0.06712070107460022,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.1627671718597412,
"rewards/margins": 0.5920388698577881,
"rewards/rejected": -0.7548059821128845,
"step": 2200
},
{
"epoch": 0.19405966676179395,
"grad_norm": 0.8824114203453064,
"learning_rate": 8.383950617283952e-06,
"logits/chosen": 2.6323628425598145,
"logits/rejected": 2.694988489151001,
"logps/chosen": -0.5362733006477356,
"logps/rejected": -3.8703293800354004,
"loss": 0.5774,
"nll_loss": 0.0370684489607811,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.16088199615478516,
"rewards/margins": 1.0002167224884033,
"rewards/rejected": -1.1610987186431885,
"step": 2210
},
{
"epoch": 0.1949377648014401,
"grad_norm": 7.0314717292785645,
"learning_rate": 8.371604938271607e-06,
"logits/chosen": 2.701634407043457,
"logits/rejected": 2.749321222305298,
"logps/chosen": -0.914169430732727,
"logps/rejected": -2.8599061965942383,
"loss": 0.7608,
"nll_loss": 0.10523217916488647,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.2742508053779602,
"rewards/margins": 0.5837210416793823,
"rewards/rejected": -0.8579719662666321,
"step": 2220
},
{
"epoch": 0.1958158628410862,
"grad_norm": 0.8926441669464111,
"learning_rate": 8.359259259259261e-06,
"logits/chosen": 2.576303720474243,
"logits/rejected": 2.606104850769043,
"logps/chosen": -0.8748048543930054,
"logps/rejected": -2.003169536590576,
"loss": 0.7253,
"nll_loss": 0.07492824643850327,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.2624414563179016,
"rewards/margins": 0.33850938081741333,
"rewards/rejected": -0.6009508371353149,
"step": 2230
},
{
"epoch": 0.19669396088073232,
"grad_norm": 1.0316241979599,
"learning_rate": 8.346913580246915e-06,
"logits/chosen": 2.5414836406707764,
"logits/rejected": 2.6227262020111084,
"logps/chosen": -0.6043969988822937,
"logps/rejected": -3.0870282649993896,
"loss": 0.5888,
"nll_loss": 0.04015268385410309,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.18131910264492035,
"rewards/margins": 0.7447894215583801,
"rewards/rejected": -0.9261085391044617,
"step": 2240
},
{
"epoch": 0.19757205892037846,
"grad_norm": 7.171932220458984,
"learning_rate": 8.334567901234568e-06,
"logits/chosen": 2.5014796257019043,
"logits/rejected": 2.4662632942199707,
"logps/chosen": -0.5610159039497375,
"logps/rejected": -2.8177947998046875,
"loss": 0.6317,
"nll_loss": 0.06384368985891342,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.16830478608608246,
"rewards/margins": 0.6770337224006653,
"rewards/rejected": -0.8453385233879089,
"step": 2250
},
{
"epoch": 0.19845015696002458,
"grad_norm": 3.9543685913085938,
"learning_rate": 8.322222222222223e-06,
"logits/chosen": 2.628187656402588,
"logits/rejected": 2.585869312286377,
"logps/chosen": -0.62028568983078,
"logps/rejected": -2.7624683380126953,
"loss": 0.6441,
"nll_loss": 0.06617014110088348,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.18608573079109192,
"rewards/margins": 0.6426547765731812,
"rewards/rejected": -0.8287404775619507,
"step": 2260
},
{
"epoch": 0.19932825499967072,
"grad_norm": 4.6166815757751465,
"learning_rate": 8.309876543209877e-06,
"logits/chosen": 2.4966917037963867,
"logits/rejected": 2.537562847137451,
"logps/chosen": -0.963221549987793,
"logps/rejected": -3.5489554405212402,
"loss": 0.663,
"nll_loss": 0.06778384000062943,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.28896647691726685,
"rewards/margins": 0.7757201790809631,
"rewards/rejected": -1.06468665599823,
"step": 2270
},
{
"epoch": 0.20020635303931683,
"grad_norm": 11.345588684082031,
"learning_rate": 8.297530864197531e-06,
"logits/chosen": 2.5486679077148438,
"logits/rejected": 2.4989380836486816,
"logps/chosen": -0.6900479197502136,
"logps/rejected": -2.254317045211792,
"loss": 0.7252,
"nll_loss": 0.07977007329463959,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.20701436698436737,
"rewards/margins": 0.4692806601524353,
"rewards/rejected": -0.6762951016426086,
"step": 2280
},
{
"epoch": 0.20108445107896297,
"grad_norm": 0.1733788251876831,
"learning_rate": 8.285185185185186e-06,
"logits/chosen": 2.6525139808654785,
"logits/rejected": 2.702258586883545,
"logps/chosen": -0.6712150573730469,
"logps/rejected": -3.3791470527648926,
"loss": 0.6285,
"nll_loss": 0.06629864871501923,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.20136454701423645,
"rewards/margins": 0.8123796582221985,
"rewards/rejected": -1.0137441158294678,
"step": 2290
},
{
"epoch": 0.20196254911860909,
"grad_norm": 2.2842109203338623,
"learning_rate": 8.27283950617284e-06,
"logits/chosen": 2.430759906768799,
"logits/rejected": 2.509899616241455,
"logps/chosen": -0.612421989440918,
"logps/rejected": -2.978832960128784,
"loss": 0.6656,
"nll_loss": 0.0659157857298851,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.18372659385204315,
"rewards/margins": 0.7099233865737915,
"rewards/rejected": -0.8936498761177063,
"step": 2300
},
{
"epoch": 0.20284064715825523,
"grad_norm": 6.801368713378906,
"learning_rate": 8.260493827160495e-06,
"logits/chosen": 2.424044609069824,
"logits/rejected": 2.4345576763153076,
"logps/chosen": -0.4499019682407379,
"logps/rejected": -3.497096300125122,
"loss": 0.5669,
"nll_loss": 0.03968087583780289,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.13497060537338257,
"rewards/margins": 0.9141584634780884,
"rewards/rejected": -1.0491290092468262,
"step": 2310
},
{
"epoch": 0.20371874519790134,
"grad_norm": 4.946938514709473,
"learning_rate": 8.248148148148149e-06,
"logits/chosen": 2.310943841934204,
"logits/rejected": 2.327831983566284,
"logps/chosen": -0.6226638555526733,
"logps/rejected": -5.598433971405029,
"loss": 0.4657,
"nll_loss": 0.053391944617033005,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.18679918348789215,
"rewards/margins": 1.4927312135696411,
"rewards/rejected": -1.679530382156372,
"step": 2320
},
{
"epoch": 0.20459684323754748,
"grad_norm": 24.650245666503906,
"learning_rate": 8.235802469135803e-06,
"logits/chosen": 2.0664007663726807,
"logits/rejected": 2.158742904663086,
"logps/chosen": -2.7977170944213867,
"logps/rejected": -6.907550811767578,
"loss": 1.5409,
"nll_loss": 0.6891128420829773,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8393152356147766,
"rewards/margins": 1.2329500913619995,
"rewards/rejected": -2.072265148162842,
"step": 2330
},
{
"epoch": 0.2054749412771936,
"grad_norm": 2.8847906589508057,
"learning_rate": 8.223456790123458e-06,
"logits/chosen": 2.0425117015838623,
"logits/rejected": 2.046208381652832,
"logps/chosen": -0.8856005668640137,
"logps/rejected": -3.2776896953582764,
"loss": 0.7465,
"nll_loss": 0.10851933062076569,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.2656802237033844,
"rewards/margins": 0.717626690864563,
"rewards/rejected": -0.983306884765625,
"step": 2340
},
{
"epoch": 0.20635303931683974,
"grad_norm": 6.361351013183594,
"learning_rate": 8.211111111111112e-06,
"logits/chosen": 2.2434327602386475,
"logits/rejected": 2.3048999309539795,
"logps/chosen": -0.753734290599823,
"logps/rejected": -3.09961199760437,
"loss": 0.6477,
"nll_loss": 0.05653975531458855,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.22612027823925018,
"rewards/margins": 0.7037633657455444,
"rewards/rejected": -0.929883599281311,
"step": 2350
},
{
"epoch": 0.20723113735648585,
"grad_norm": 4.796877384185791,
"learning_rate": 8.198765432098767e-06,
"logits/chosen": 2.420621156692505,
"logits/rejected": 2.454742670059204,
"logps/chosen": -0.5336098670959473,
"logps/rejected": -2.5250916481018066,
"loss": 0.6861,
"nll_loss": 0.07172581553459167,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.16008298099040985,
"rewards/margins": 0.5974445343017578,
"rewards/rejected": -0.7575275897979736,
"step": 2360
},
{
"epoch": 0.208109235396132,
"grad_norm": 6.713689804077148,
"learning_rate": 8.186419753086421e-06,
"logits/chosen": 2.700634241104126,
"logits/rejected": 2.761862277984619,
"logps/chosen": -0.9955413937568665,
"logps/rejected": -3.3551056385040283,
"loss": 0.7475,
"nll_loss": 0.11970362812280655,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2986624538898468,
"rewards/margins": 0.7078693509101868,
"rewards/rejected": -1.0065317153930664,
"step": 2370
},
{
"epoch": 0.2089873334357781,
"grad_norm": 5.509604454040527,
"learning_rate": 8.174074074074074e-06,
"logits/chosen": 2.601839780807495,
"logits/rejected": 2.6365180015563965,
"logps/chosen": -0.7382031679153442,
"logps/rejected": -2.511854648590088,
"loss": 0.6877,
"nll_loss": 0.058238618075847626,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.2214609682559967,
"rewards/margins": 0.5320954918861389,
"rewards/rejected": -0.7535563707351685,
"step": 2380
},
{
"epoch": 0.20986543147542422,
"grad_norm": 0.9899409413337708,
"learning_rate": 8.161728395061728e-06,
"logits/chosen": 2.59370493888855,
"logits/rejected": 2.653318166732788,
"logps/chosen": -0.43372973799705505,
"logps/rejected": -2.1165356636047363,
"loss": 0.6498,
"nll_loss": 0.055299948900938034,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.1301189363002777,
"rewards/margins": 0.5048418045043945,
"rewards/rejected": -0.6349607706069946,
"step": 2390
},
{
"epoch": 0.21074352951507036,
"grad_norm": 2.4443204402923584,
"learning_rate": 8.149382716049383e-06,
"logits/chosen": 2.621575355529785,
"logits/rejected": 2.6399495601654053,
"logps/chosen": -0.6197006702423096,
"logps/rejected": -2.7705483436584473,
"loss": 0.6775,
"nll_loss": 0.09681596606969833,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.1859102100133896,
"rewards/margins": 0.6452543139457703,
"rewards/rejected": -0.831164538860321,
"step": 2400
},
{
"epoch": 0.21162162755471647,
"grad_norm": 3.302237033843994,
"learning_rate": 8.137037037037037e-06,
"logits/chosen": 2.6685478687286377,
"logits/rejected": 2.645535469055176,
"logps/chosen": -0.5314685106277466,
"logps/rejected": -2.784475803375244,
"loss": 0.6021,
"nll_loss": 0.05336238071322441,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.1594405472278595,
"rewards/margins": 0.675902247428894,
"rewards/rejected": -0.8353427052497864,
"step": 2410
},
{
"epoch": 0.21249972559436262,
"grad_norm": 1.3908727169036865,
"learning_rate": 8.124691358024692e-06,
"logits/chosen": 2.372706413269043,
"logits/rejected": 2.4324469566345215,
"logps/chosen": -0.5971062779426575,
"logps/rejected": -2.159764051437378,
"loss": 0.6943,
"nll_loss": 0.05865710228681564,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.179131880402565,
"rewards/margins": 0.4687972664833069,
"rewards/rejected": -0.6479291915893555,
"step": 2420
},
{
"epoch": 0.21337782363400873,
"grad_norm": 2.2388875484466553,
"learning_rate": 8.112345679012346e-06,
"logits/chosen": 2.378962993621826,
"logits/rejected": 2.4710259437561035,
"logps/chosen": -0.6657778024673462,
"logps/rejected": -2.7295007705688477,
"loss": 0.6557,
"nll_loss": 0.07417738437652588,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.19973333179950714,
"rewards/margins": 0.6191169619560242,
"rewards/rejected": -0.8188502192497253,
"step": 2430
},
{
"epoch": 0.21425592167365487,
"grad_norm": 11.22681713104248,
"learning_rate": 8.1e-06,
"logits/chosen": 2.5654773712158203,
"logits/rejected": 2.5966382026672363,
"logps/chosen": -0.5373214483261108,
"logps/rejected": -2.4600396156311035,
"loss": 0.6436,
"nll_loss": 0.07008077204227448,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.16119642555713654,
"rewards/margins": 0.5768154263496399,
"rewards/rejected": -0.7380119562149048,
"step": 2440
},
{
"epoch": 0.21513401971330098,
"grad_norm": 3.76454496383667,
"learning_rate": 8.087654320987655e-06,
"logits/chosen": 2.541652202606201,
"logits/rejected": 2.5319390296936035,
"logps/chosen": -0.7174406051635742,
"logps/rejected": -2.493823528289795,
"loss": 0.694,
"nll_loss": 0.08293718844652176,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.21523217856884003,
"rewards/margins": 0.5329148173332214,
"rewards/rejected": -0.7481471300125122,
"step": 2450
},
{
"epoch": 0.21601211775294712,
"grad_norm": 6.552890777587891,
"learning_rate": 8.07530864197531e-06,
"logits/chosen": 2.4907002449035645,
"logits/rejected": 2.483581066131592,
"logps/chosen": -0.5098174214363098,
"logps/rejected": -2.033862829208374,
"loss": 0.6884,
"nll_loss": 0.06638985127210617,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.15294523537158966,
"rewards/margins": 0.4572136402130127,
"rewards/rejected": -0.6101589202880859,
"step": 2460
},
{
"epoch": 0.21689021579259324,
"grad_norm": 1.994023323059082,
"learning_rate": 8.062962962962964e-06,
"logits/chosen": 2.6207573413848877,
"logits/rejected": 2.5812675952911377,
"logps/chosen": -0.7166529297828674,
"logps/rejected": -2.042680263519287,
"loss": 0.673,
"nll_loss": 0.04922042042016983,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.21499589085578918,
"rewards/margins": 0.3978081941604614,
"rewards/rejected": -0.6128040552139282,
"step": 2470
},
{
"epoch": 0.21776831383223938,
"grad_norm": 1.6430639028549194,
"learning_rate": 8.050617283950618e-06,
"logits/chosen": 2.2188925743103027,
"logits/rejected": 2.256579875946045,
"logps/chosen": -0.4708133637905121,
"logps/rejected": -1.9442718029022217,
"loss": 0.643,
"nll_loss": 0.043398790061473846,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.14124402403831482,
"rewards/margins": 0.44203758239746094,
"rewards/rejected": -0.5832816362380981,
"step": 2480
},
{
"epoch": 0.2186464118718855,
"grad_norm": 2.5793867111206055,
"learning_rate": 8.038271604938272e-06,
"logits/chosen": 2.5893726348876953,
"logits/rejected": 2.5955262184143066,
"logps/chosen": -0.9521909952163696,
"logps/rejected": -2.2524361610412598,
"loss": 0.7949,
"nll_loss": 0.09755026549100876,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.28565728664398193,
"rewards/margins": 0.3900734782218933,
"rewards/rejected": -0.6757307648658752,
"step": 2490
},
{
"epoch": 0.21952450991153163,
"grad_norm": 3.0991997718811035,
"learning_rate": 8.025925925925927e-06,
"logits/chosen": 2.4973714351654053,
"logits/rejected": 2.5837717056274414,
"logps/chosen": -0.7727764248847961,
"logps/rejected": -1.6810334920883179,
"loss": 0.7202,
"nll_loss": 0.08514519035816193,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.23183290660381317,
"rewards/margins": 0.2724771797657013,
"rewards/rejected": -0.5043100714683533,
"step": 2500
},
{
"epoch": 0.22040260795117775,
"grad_norm": 0.889065682888031,
"learning_rate": 8.01358024691358e-06,
"logits/chosen": 2.623380422592163,
"logits/rejected": 2.6181862354278564,
"logps/chosen": -0.6605237126350403,
"logps/rejected": -2.3106017112731934,
"loss": 0.6626,
"nll_loss": 0.056938063353300095,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.19815710186958313,
"rewards/margins": 0.4950234889984131,
"rewards/rejected": -0.6931806206703186,
"step": 2510
},
{
"epoch": 0.2212807059908239,
"grad_norm": 1.9629164934158325,
"learning_rate": 8.001234567901234e-06,
"logits/chosen": 2.725886583328247,
"logits/rejected": 2.8111538887023926,
"logps/chosen": -0.9530594944953918,
"logps/rejected": -2.9670989513397217,
"loss": 0.6788,
"nll_loss": 0.08854852616786957,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.28591784834861755,
"rewards/margins": 0.6042118072509766,
"rewards/rejected": -0.8901296854019165,
"step": 2520
},
{
"epoch": 0.22215880403047,
"grad_norm": 2.8490424156188965,
"learning_rate": 7.98888888888889e-06,
"logits/chosen": 2.6199440956115723,
"logits/rejected": 2.5510783195495605,
"logps/chosen": -0.7600888013839722,
"logps/rejected": -2.3557190895080566,
"loss": 0.6933,
"nll_loss": 0.07890000194311142,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.22802665829658508,
"rewards/margins": 0.47868919372558594,
"rewards/rejected": -0.7067158222198486,
"step": 2530
},
{
"epoch": 0.22303690207011612,
"grad_norm": 2.2429914474487305,
"learning_rate": 7.976543209876545e-06,
"logits/chosen": 2.657155752182007,
"logits/rejected": 2.730384349822998,
"logps/chosen": -0.6944109201431274,
"logps/rejected": -2.3378891944885254,
"loss": 0.6597,
"nll_loss": 0.07231010496616364,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.20832328498363495,
"rewards/margins": 0.4930434226989746,
"rewards/rejected": -0.7013667821884155,
"step": 2540
},
{
"epoch": 0.22391500010976226,
"grad_norm": 8.76103401184082,
"learning_rate": 7.964197530864199e-06,
"logits/chosen": 2.425436019897461,
"logits/rejected": 2.432227849960327,
"logps/chosen": -0.8007850646972656,
"logps/rejected": -1.2312095165252686,
"loss": 0.8001,
"nll_loss": 0.08954410254955292,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.24023552238941193,
"rewards/margins": 0.1291273534297943,
"rewards/rejected": -0.36936289072036743,
"step": 2550
},
{
"epoch": 0.22479309814940837,
"grad_norm": 0.9743072986602783,
"learning_rate": 7.951851851851853e-06,
"logits/chosen": 2.5851752758026123,
"logits/rejected": 2.6295394897460938,
"logps/chosen": -0.5336810946464539,
"logps/rejected": -1.8357422351837158,
"loss": 0.6645,
"nll_loss": 0.0451740063726902,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.16010431945323944,
"rewards/margins": 0.39061832427978516,
"rewards/rejected": -0.5507226586341858,
"step": 2560
},
{
"epoch": 0.2256711961890545,
"grad_norm": 16.18039321899414,
"learning_rate": 7.939506172839508e-06,
"logits/chosen": 2.548182487487793,
"logits/rejected": 2.511976718902588,
"logps/chosen": -0.34628647565841675,
"logps/rejected": -1.9476335048675537,
"loss": 0.6362,
"nll_loss": 0.043379928916692734,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.1038859486579895,
"rewards/margins": 0.4804041385650635,
"rewards/rejected": -0.584290087223053,
"step": 2570
},
{
"epoch": 0.22654929422870063,
"grad_norm": 0.8062020540237427,
"learning_rate": 7.92716049382716e-06,
"logits/chosen": 2.4778008460998535,
"logits/rejected": 2.540804147720337,
"logps/chosen": -0.5255548357963562,
"logps/rejected": -2.09405779838562,
"loss": 0.6816,
"nll_loss": 0.07233087718486786,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.15766644477844238,
"rewards/margins": 0.4705510139465332,
"rewards/rejected": -0.6282175183296204,
"step": 2580
},
{
"epoch": 0.22742739226834677,
"grad_norm": 0.9400202631950378,
"learning_rate": 7.914814814814815e-06,
"logits/chosen": 2.3705923557281494,
"logits/rejected": 2.3822696208953857,
"logps/chosen": -0.29244324564933777,
"logps/rejected": -1.8687235116958618,
"loss": 0.643,
"nll_loss": 0.0392463319003582,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.08773298561573029,
"rewards/margins": 0.47288402915000916,
"rewards/rejected": -0.5606169700622559,
"step": 2590
},
{
"epoch": 0.22830549030799288,
"grad_norm": 2.667971134185791,
"learning_rate": 7.90246913580247e-06,
"logits/chosen": 2.3539376258850098,
"logits/rejected": 2.370870351791382,
"logps/chosen": -0.8493935465812683,
"logps/rejected": -2.638115882873535,
"loss": 0.7404,
"nll_loss": 0.10912100225687027,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2548181116580963,
"rewards/margins": 0.5366166830062866,
"rewards/rejected": -0.7914347648620605,
"step": 2600
},
{
"epoch": 0.22918358834763902,
"grad_norm": 2.8281736373901367,
"learning_rate": 7.890123456790124e-06,
"logits/chosen": 2.146486759185791,
"logits/rejected": 2.068389415740967,
"logps/chosen": -0.5109766125679016,
"logps/rejected": -1.839223861694336,
"loss": 0.6687,
"nll_loss": 0.056281328201293945,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.15329298377037048,
"rewards/margins": 0.39847415685653687,
"rewards/rejected": -0.5517671704292297,
"step": 2610
},
{
"epoch": 0.23006168638728514,
"grad_norm": 0.8495587706565857,
"learning_rate": 7.877777777777778e-06,
"logits/chosen": 2.6396515369415283,
"logits/rejected": 2.7024216651916504,
"logps/chosen": -0.6976840496063232,
"logps/rejected": -2.1378865242004395,
"loss": 0.6814,
"nll_loss": 0.056956302374601364,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.20930524170398712,
"rewards/margins": 0.43206077814102173,
"rewards/rejected": -0.6413660049438477,
"step": 2620
},
{
"epoch": 0.23093978442693128,
"grad_norm": 3.5757088661193848,
"learning_rate": 7.865432098765433e-06,
"logits/chosen": 2.367114305496216,
"logits/rejected": 2.4596433639526367,
"logps/chosen": -0.8233789205551147,
"logps/rejected": -3.531553268432617,
"loss": 0.6048,
"nll_loss": 0.07218165695667267,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.24701371788978577,
"rewards/margins": 0.8124523162841797,
"rewards/rejected": -1.059466004371643,
"step": 2630
},
{
"epoch": 0.2318178824665774,
"grad_norm": 1.7588326930999756,
"learning_rate": 7.853086419753087e-06,
"logits/chosen": 2.2890639305114746,
"logits/rejected": 2.2860817909240723,
"logps/chosen": -0.31422901153564453,
"logps/rejected": -2.4267849922180176,
"loss": 0.584,
"nll_loss": 0.03038870170712471,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.09426870942115784,
"rewards/margins": 0.6337667107582092,
"rewards/rejected": -0.7280355095863342,
"step": 2640
},
{
"epoch": 0.23269598050622353,
"grad_norm": 4.534848213195801,
"learning_rate": 7.840740740740741e-06,
"logits/chosen": 2.2893667221069336,
"logits/rejected": 2.3961310386657715,
"logps/chosen": -1.1099700927734375,
"logps/rejected": -3.46467924118042,
"loss": 0.7172,
"nll_loss": 0.1096486896276474,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3329910337924957,
"rewards/margins": 0.7064129114151001,
"rewards/rejected": -1.0394039154052734,
"step": 2650
},
{
"epoch": 0.23357407854586965,
"grad_norm": 4.673385143280029,
"learning_rate": 7.828395061728396e-06,
"logits/chosen": 2.296675205230713,
"logits/rejected": 2.2739694118499756,
"logps/chosen": -0.5930169820785522,
"logps/rejected": -2.1158299446105957,
"loss": 0.6747,
"nll_loss": 0.06488887220621109,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1779050976037979,
"rewards/margins": 0.4568440020084381,
"rewards/rejected": -0.6347490549087524,
"step": 2660
},
{
"epoch": 0.23445217658551576,
"grad_norm": 4.619322776794434,
"learning_rate": 7.81604938271605e-06,
"logits/chosen": 2.316483974456787,
"logits/rejected": 2.2958438396453857,
"logps/chosen": -0.8736156225204468,
"logps/rejected": -2.8213837146759033,
"loss": 0.7145,
"nll_loss": 0.0816427692770958,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2620847225189209,
"rewards/margins": 0.5843304395675659,
"rewards/rejected": -0.8464152216911316,
"step": 2670
},
{
"epoch": 0.2353302746251619,
"grad_norm": 2.9810938835144043,
"learning_rate": 7.803703703703705e-06,
"logits/chosen": 2.459575891494751,
"logits/rejected": 2.5659358501434326,
"logps/chosen": -0.7047882080078125,
"logps/rejected": -2.6978650093078613,
"loss": 0.6257,
"nll_loss": 0.048231981694698334,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.21143648028373718,
"rewards/margins": 0.5979229807853699,
"rewards/rejected": -0.8093594312667847,
"step": 2680
},
{
"epoch": 0.23620837266480801,
"grad_norm": 1.5284343957901,
"learning_rate": 7.791358024691359e-06,
"logits/chosen": 2.327259063720703,
"logits/rejected": 2.342414379119873,
"logps/chosen": -0.630928635597229,
"logps/rejected": -1.5021655559539795,
"loss": 0.749,
"nll_loss": 0.08067157119512558,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.18927858769893646,
"rewards/margins": 0.26137107610702515,
"rewards/rejected": -0.4506497383117676,
"step": 2690
},
{
"epoch": 0.23708647070445416,
"grad_norm": 2.211660385131836,
"learning_rate": 7.779012345679014e-06,
"logits/chosen": 2.6317856311798096,
"logits/rejected": 2.615809679031372,
"logps/chosen": -0.5954752564430237,
"logps/rejected": -2.3459115028381348,
"loss": 0.6778,
"nll_loss": 0.08433017879724503,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.17864257097244263,
"rewards/margins": 0.5251308679580688,
"rewards/rejected": -0.7037734389305115,
"step": 2700
},
{
"epoch": 0.23796456874410027,
"grad_norm": 1.088346004486084,
"learning_rate": 7.766666666666666e-06,
"logits/chosen": 2.3896658420562744,
"logits/rejected": 2.4483752250671387,
"logps/chosen": -0.9959812164306641,
"logps/rejected": -2.1024794578552246,
"loss": 0.7702,
"nll_loss": 0.07456602156162262,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.29879438877105713,
"rewards/margins": 0.33194953203201294,
"rewards/rejected": -0.6307438611984253,
"step": 2710
},
{
"epoch": 0.2388426667837464,
"grad_norm": 0.024926647543907166,
"learning_rate": 7.75432098765432e-06,
"logits/chosen": 2.4478306770324707,
"logits/rejected": 2.464536190032959,
"logps/chosen": -0.5027719736099243,
"logps/rejected": -2.434457540512085,
"loss": 0.6162,
"nll_loss": 0.037077441811561584,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.15083159506320953,
"rewards/margins": 0.5795056819915771,
"rewards/rejected": -0.730337381362915,
"step": 2720
},
{
"epoch": 0.23972076482339252,
"grad_norm": 2.2997021675109863,
"learning_rate": 7.741975308641975e-06,
"logits/chosen": 2.381772518157959,
"logits/rejected": 2.4063820838928223,
"logps/chosen": -0.2956869602203369,
"logps/rejected": -3.0764455795288086,
"loss": 0.5462,
"nll_loss": 0.0379708856344223,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.08870609104633331,
"rewards/margins": 0.834227442741394,
"rewards/rejected": -0.9229336977005005,
"step": 2730
},
{
"epoch": 0.24059886286303866,
"grad_norm": 0.1084788367152214,
"learning_rate": 7.72962962962963e-06,
"logits/chosen": 2.582411766052246,
"logits/rejected": 2.6022956371307373,
"logps/chosen": -0.8210613131523132,
"logps/rejected": -1.958987832069397,
"loss": 0.7403,
"nll_loss": 0.07586108148097992,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.24631838500499725,
"rewards/margins": 0.34137797355651855,
"rewards/rejected": -0.5876964330673218,
"step": 2740
},
{
"epoch": 0.24147696090268478,
"grad_norm": 3.7977356910705566,
"learning_rate": 7.717283950617284e-06,
"logits/chosen": 2.4403786659240723,
"logits/rejected": 2.398824453353882,
"logps/chosen": -0.5754778981208801,
"logps/rejected": -2.9741971492767334,
"loss": 0.631,
"nll_loss": 0.06641169637441635,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.17264336347579956,
"rewards/margins": 0.7196158170700073,
"rewards/rejected": -0.8922592401504517,
"step": 2750
},
{
"epoch": 0.24235505894233092,
"grad_norm": 1.7483222484588623,
"learning_rate": 7.70493827160494e-06,
"logits/chosen": 2.11322021484375,
"logits/rejected": 2.1725521087646484,
"logps/chosen": -0.5592783689498901,
"logps/rejected": -1.9913737773895264,
"loss": 0.671,
"nll_loss": 0.07330699265003204,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.16778354346752167,
"rewards/margins": 0.4296286106109619,
"rewards/rejected": -0.5974121689796448,
"step": 2760
},
{
"epoch": 0.24323315698197703,
"grad_norm": 1.0311236381530762,
"learning_rate": 7.692592592592594e-06,
"logits/chosen": 2.309854507446289,
"logits/rejected": 2.313572406768799,
"logps/chosen": -0.6930335760116577,
"logps/rejected": -2.353086233139038,
"loss": 0.7228,
"nll_loss": 0.0806727483868599,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.20791009068489075,
"rewards/margins": 0.498015820980072,
"rewards/rejected": -0.7059258818626404,
"step": 2770
},
{
"epoch": 0.24411125502162317,
"grad_norm": 4.491925239562988,
"learning_rate": 7.680246913580247e-06,
"logits/chosen": 2.534341335296631,
"logits/rejected": 2.560044050216675,
"logps/chosen": -0.536239743232727,
"logps/rejected": -2.622220993041992,
"loss": 0.6372,
"nll_loss": 0.05687220022082329,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.1608719378709793,
"rewards/margins": 0.625794529914856,
"rewards/rejected": -0.7866664528846741,
"step": 2780
},
{
"epoch": 0.2449893530612693,
"grad_norm": 0.7264915108680725,
"learning_rate": 7.667901234567902e-06,
"logits/chosen": 2.2637343406677246,
"logits/rejected": 2.312181234359741,
"logps/chosen": -0.7516659498214722,
"logps/rejected": -1.9516077041625977,
"loss": 0.7264,
"nll_loss": 0.07927460223436356,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.22549979388713837,
"rewards/margins": 0.3599824607372284,
"rewards/rejected": -0.5854822993278503,
"step": 2790
},
{
"epoch": 0.24586745110091543,
"grad_norm": 3.0847902297973633,
"learning_rate": 7.655555555555556e-06,
"logits/chosen": 2.367601156234741,
"logits/rejected": 2.403787612915039,
"logps/chosen": -0.41557034850120544,
"logps/rejected": -2.6111361980438232,
"loss": 0.6177,
"nll_loss": 0.04188116267323494,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.12467111647129059,
"rewards/margins": 0.6586698889732361,
"rewards/rejected": -0.7833409905433655,
"step": 2800
},
{
"epoch": 0.24674554914056154,
"grad_norm": 4.381664752960205,
"learning_rate": 7.64320987654321e-06,
"logits/chosen": 2.159615993499756,
"logits/rejected": 2.1967501640319824,
"logps/chosen": -0.8560575246810913,
"logps/rejected": -2.5341227054595947,
"loss": 0.6988,
"nll_loss": 0.0791650265455246,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2568172812461853,
"rewards/margins": 0.5034195780754089,
"rewards/rejected": -0.7602368593215942,
"step": 2810
},
{
"epoch": 0.24762364718020766,
"grad_norm": 1.5694470405578613,
"learning_rate": 7.630864197530865e-06,
"logits/chosen": 2.573787212371826,
"logits/rejected": 2.6199076175689697,
"logps/chosen": -0.3929263949394226,
"logps/rejected": -2.310509443283081,
"loss": 0.5938,
"nll_loss": 0.04064936563372612,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.11787792295217514,
"rewards/margins": 0.5752750039100647,
"rewards/rejected": -0.6931529641151428,
"step": 2820
},
{
"epoch": 0.2485017452198538,
"grad_norm": 0.7673569321632385,
"learning_rate": 7.618518518518519e-06,
"logits/chosen": 2.1453781127929688,
"logits/rejected": 2.2463371753692627,
"logps/chosen": -0.40726566314697266,
"logps/rejected": -2.0054023265838623,
"loss": 0.658,
"nll_loss": 0.048675037920475006,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.12217970192432404,
"rewards/margins": 0.4794410765171051,
"rewards/rejected": -0.6016206741333008,
"step": 2830
},
{
"epoch": 0.2493798432594999,
"grad_norm": 0.3016711473464966,
"learning_rate": 7.606172839506173e-06,
"logits/chosen": 2.3802428245544434,
"logits/rejected": 2.4284090995788574,
"logps/chosen": -0.2703506350517273,
"logps/rejected": -2.1822762489318848,
"loss": 0.6137,
"nll_loss": 0.03810378909111023,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.08110519498586655,
"rewards/margins": 0.5735777020454407,
"rewards/rejected": -0.6546828746795654,
"step": 2840
},
{
"epoch": 0.250257941299146,
"grad_norm": 7.489548683166504,
"learning_rate": 7.593827160493827e-06,
"logits/chosen": 2.248429298400879,
"logits/rejected": 2.308821678161621,
"logps/chosen": -0.8789108991622925,
"logps/rejected": -3.1246941089630127,
"loss": 0.664,
"nll_loss": 0.07486443221569061,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.2636732757091522,
"rewards/margins": 0.6737349033355713,
"rewards/rejected": -0.9374082684516907,
"step": 2850
},
{
"epoch": 0.2511360393387922,
"grad_norm": 1.4026418924331665,
"learning_rate": 7.581481481481482e-06,
"logits/chosen": 2.4665749073028564,
"logits/rejected": 2.4932830333709717,
"logps/chosen": -0.6312376260757446,
"logps/rejected": -2.414559841156006,
"loss": 0.6769,
"nll_loss": 0.043715715408325195,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.18937130272388458,
"rewards/margins": 0.5349966883659363,
"rewards/rejected": -0.7243679761886597,
"step": 2860
},
{
"epoch": 0.2520141373784383,
"grad_norm": 6.41206169128418,
"learning_rate": 7.569135802469136e-06,
"logits/chosen": 1.9791500568389893,
"logits/rejected": 2.065732479095459,
"logps/chosen": -0.3990306854248047,
"logps/rejected": -2.8924102783203125,
"loss": 0.5879,
"nll_loss": 0.03257184475660324,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.11970920860767365,
"rewards/margins": 0.7480138540267944,
"rewards/rejected": -0.8677231073379517,
"step": 2870
},
{
"epoch": 0.2528922354180844,
"grad_norm": 0.007237335667014122,
"learning_rate": 7.5567901234567905e-06,
"logits/chosen": 2.199582099914551,
"logits/rejected": 2.2399208545684814,
"logps/chosen": -0.5396376848220825,
"logps/rejected": -2.956026554107666,
"loss": 0.6215,
"nll_loss": 0.05001888796687126,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.16189131140708923,
"rewards/margins": 0.7249167561531067,
"rewards/rejected": -0.8868080377578735,
"step": 2880
},
{
"epoch": 0.25377033345773053,
"grad_norm": 1.490009069442749,
"learning_rate": 7.544444444444445e-06,
"logits/chosen": 2.2380213737487793,
"logits/rejected": 2.329550266265869,
"logps/chosen": -0.5914583206176758,
"logps/rejected": -3.31396484375,
"loss": 0.6415,
"nll_loss": 0.07680721580982208,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.17743751406669617,
"rewards/margins": 0.8167519569396973,
"rewards/rejected": -0.9941895604133606,
"step": 2890
},
{
"epoch": 0.2546484314973767,
"grad_norm": 4.014424800872803,
"learning_rate": 7.5320987654321e-06,
"logits/chosen": 1.9832671880722046,
"logits/rejected": 2.1117234230041504,
"logps/chosen": -0.7208765745162964,
"logps/rejected": -2.1712794303894043,
"loss": 0.7748,
"nll_loss": 0.10638797283172607,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.21626298129558563,
"rewards/margins": 0.43512091040611267,
"rewards/rejected": -0.6513839364051819,
"step": 2900
},
{
"epoch": 0.2555265295370228,
"grad_norm": 0.9837947487831116,
"learning_rate": 7.519753086419753e-06,
"logits/chosen": 2.266629695892334,
"logits/rejected": 2.3368468284606934,
"logps/chosen": -0.5821598172187805,
"logps/rejected": -3.631063938140869,
"loss": 0.622,
"nll_loss": 0.04266344755887985,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.1746479570865631,
"rewards/margins": 0.9146712422370911,
"rewards/rejected": -1.0893189907073975,
"step": 2910
},
{
"epoch": 0.25640462757666893,
"grad_norm": 7.946182727813721,
"learning_rate": 7.507407407407407e-06,
"logits/chosen": 2.081207275390625,
"logits/rejected": 2.123415946960449,
"logps/chosen": -0.6722933053970337,
"logps/rejected": -3.213026762008667,
"loss": 0.6752,
"nll_loss": 0.09485231339931488,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.2016880065202713,
"rewards/margins": 0.762220025062561,
"rewards/rejected": -0.9639080762863159,
"step": 2920
},
{
"epoch": 0.25728272561631504,
"grad_norm": 3.1857895851135254,
"learning_rate": 7.495061728395062e-06,
"logits/chosen": 2.0671546459198,
"logits/rejected": 2.1429953575134277,
"logps/chosen": -1.1341451406478882,
"logps/rejected": -3.0532522201538086,
"loss": 0.7432,
"nll_loss": 0.07530729472637177,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.34024354815483093,
"rewards/margins": 0.5757321119308472,
"rewards/rejected": -0.9159756898880005,
"step": 2930
},
{
"epoch": 0.2581608236559612,
"grad_norm": 5.178243637084961,
"learning_rate": 7.482716049382717e-06,
"logits/chosen": 2.082383394241333,
"logits/rejected": 2.076977252960205,
"logps/chosen": -0.3990221321582794,
"logps/rejected": -1.7076250314712524,
"loss": 0.6855,
"nll_loss": 0.06153715401887894,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.1197066530585289,
"rewards/margins": 0.39258089661598206,
"rewards/rejected": -0.5122874975204468,
"step": 2940
},
{
"epoch": 0.2590389216956073,
"grad_norm": 1.9402194023132324,
"learning_rate": 7.4703703703703715e-06,
"logits/chosen": 2.0955326557159424,
"logits/rejected": 2.119300603866577,
"logps/chosen": -0.48326557874679565,
"logps/rejected": -2.141892194747925,
"loss": 0.6575,
"nll_loss": 0.05360071733593941,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.14497968554496765,
"rewards/margins": 0.49758806824684143,
"rewards/rejected": -0.6425677537918091,
"step": 2950
},
{
"epoch": 0.25991701973525344,
"grad_norm": 2.156696081161499,
"learning_rate": 7.458024691358026e-06,
"logits/chosen": 2.2110159397125244,
"logits/rejected": 2.210599422454834,
"logps/chosen": -0.7016893625259399,
"logps/rejected": -1.9251207113265991,
"loss": 0.7172,
"nll_loss": 0.06567586958408356,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.21050682663917542,
"rewards/margins": 0.36702945828437805,
"rewards/rejected": -0.5775362253189087,
"step": 2960
},
{
"epoch": 0.26079511777489955,
"grad_norm": 4.643383026123047,
"learning_rate": 7.44567901234568e-06,
"logits/chosen": 2.3343098163604736,
"logits/rejected": 2.3111183643341064,
"logps/chosen": -0.546606183052063,
"logps/rejected": -1.9335724115371704,
"loss": 0.6684,
"nll_loss": 0.04829854518175125,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.1639818698167801,
"rewards/margins": 0.4160899519920349,
"rewards/rejected": -0.5800718069076538,
"step": 2970
},
{
"epoch": 0.26167321581454567,
"grad_norm": 3.739795684814453,
"learning_rate": 7.433333333333334e-06,
"logits/chosen": 2.259247303009033,
"logits/rejected": 2.3139965534210205,
"logps/chosen": -0.6394690871238708,
"logps/rejected": -3.2223987579345703,
"loss": 0.6667,
"nll_loss": 0.07620217651128769,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.1918407380580902,
"rewards/margins": 0.774878978729248,
"rewards/rejected": -0.9667198061943054,
"step": 2980
},
{
"epoch": 0.26255131385419184,
"grad_norm": 3.2304673194885254,
"learning_rate": 7.420987654320988e-06,
"logits/chosen": 2.221369504928589,
"logits/rejected": 2.2490549087524414,
"logps/chosen": -0.3039132356643677,
"logps/rejected": -3.022956371307373,
"loss": 0.5563,
"nll_loss": 0.030767951160669327,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.09117396920919418,
"rewards/margins": 0.8157129287719727,
"rewards/rejected": -0.9068870544433594,
"step": 2990
},
{
"epoch": 0.26342941189383795,
"grad_norm": 1.1860368251800537,
"learning_rate": 7.408641975308643e-06,
"logits/chosen": 2.0464189052581787,
"logits/rejected": 2.0965323448181152,
"logps/chosen": -0.32864516973495483,
"logps/rejected": -2.536147356033325,
"loss": 0.617,
"nll_loss": 0.044617362320423126,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.09859354794025421,
"rewards/margins": 0.6622506380081177,
"rewards/rejected": -0.7608442306518555,
"step": 3000
},
{
"epoch": 0.26430750993348406,
"grad_norm": 1.1873282194137573,
"learning_rate": 7.396296296296297e-06,
"logits/chosen": 2.152050256729126,
"logits/rejected": 2.2446908950805664,
"logps/chosen": -0.4739529490470886,
"logps/rejected": -3.3722312450408936,
"loss": 0.6149,
"nll_loss": 0.06702496111392975,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.14218589663505554,
"rewards/margins": 0.8694835901260376,
"rewards/rejected": -1.011669397354126,
"step": 3010
},
{
"epoch": 0.2651856079731302,
"grad_norm": 4.482595920562744,
"learning_rate": 7.3839506172839516e-06,
"logits/chosen": 2.094207286834717,
"logits/rejected": 2.200845241546631,
"logps/chosen": -1.2236783504486084,
"logps/rejected": -4.37764835357666,
"loss": 0.6561,
"nll_loss": 0.09786146134138107,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3671035170555115,
"rewards/margins": 0.9461910128593445,
"rewards/rejected": -1.3132946491241455,
"step": 3020
},
{
"epoch": 0.26606370601277635,
"grad_norm": 0.8290062546730042,
"learning_rate": 7.371604938271606e-06,
"logits/chosen": 1.9782556295394897,
"logits/rejected": 2.009742021560669,
"logps/chosen": -0.425149142742157,
"logps/rejected": -3.8111705780029297,
"loss": 0.5745,
"nll_loss": 0.042985234409570694,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.12754476070404053,
"rewards/margins": 1.0158064365386963,
"rewards/rejected": -1.1433511972427368,
"step": 3030
},
{
"epoch": 0.26694180405242246,
"grad_norm": 7.664605140686035,
"learning_rate": 7.3592592592592595e-06,
"logits/chosen": 2.083406448364258,
"logits/rejected": 2.1995325088500977,
"logps/chosen": -1.0140199661254883,
"logps/rejected": -3.976250410079956,
"loss": 0.759,
"nll_loss": 0.07868107408285141,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3042060136795044,
"rewards/margins": 0.8886691927909851,
"rewards/rejected": -1.1928752660751343,
"step": 3040
},
{
"epoch": 0.2678199020920686,
"grad_norm": 5.215522766113281,
"learning_rate": 7.346913580246914e-06,
"logits/chosen": 2.006986141204834,
"logits/rejected": 2.1079816818237305,
"logps/chosen": -0.6254408955574036,
"logps/rejected": -2.870253801345825,
"loss": 0.6843,
"nll_loss": 0.07333754748106003,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.1876322478055954,
"rewards/margins": 0.6734437346458435,
"rewards/rejected": -0.8610760569572449,
"step": 3050
},
{
"epoch": 0.2686980001317147,
"grad_norm": 1.4749705791473389,
"learning_rate": 7.334567901234568e-06,
"logits/chosen": 2.1230270862579346,
"logits/rejected": 2.2760751247406006,
"logps/chosen": -0.9829801321029663,
"logps/rejected": -2.8155903816223145,
"loss": 0.7342,
"nll_loss": 0.10188277065753937,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.2948940396308899,
"rewards/margins": 0.5497831702232361,
"rewards/rejected": -0.8446771502494812,
"step": 3060
},
{
"epoch": 0.26957609817136086,
"grad_norm": 9.571310043334961,
"learning_rate": 7.322222222222223e-06,
"logits/chosen": 2.225337028503418,
"logits/rejected": 2.2470784187316895,
"logps/chosen": -0.7389670014381409,
"logps/rejected": -2.8324790000915527,
"loss": 0.6649,
"nll_loss": 0.0714489072561264,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.2216901034116745,
"rewards/margins": 0.628053605556488,
"rewards/rejected": -0.8497437238693237,
"step": 3070
},
{
"epoch": 0.27045419621100697,
"grad_norm": 1.4797980785369873,
"learning_rate": 7.309876543209877e-06,
"logits/chosen": 2.0399069786071777,
"logits/rejected": 2.1479554176330566,
"logps/chosen": -0.7341340780258179,
"logps/rejected": -2.218003749847412,
"loss": 0.7062,
"nll_loss": 0.057169754058122635,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.22024023532867432,
"rewards/margins": 0.4451608657836914,
"rewards/rejected": -0.6654011011123657,
"step": 3080
},
{
"epoch": 0.2713322942506531,
"grad_norm": 11.532062530517578,
"learning_rate": 7.297530864197532e-06,
"logits/chosen": 2.1539671421051025,
"logits/rejected": 2.135166645050049,
"logps/chosen": -0.9680719375610352,
"logps/rejected": -2.85876202583313,
"loss": 0.7423,
"nll_loss": 0.0898386538028717,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2904215455055237,
"rewards/margins": 0.5672070980072021,
"rewards/rejected": -0.8576286435127258,
"step": 3090
},
{
"epoch": 0.2722103922902992,
"grad_norm": 0.5979923605918884,
"learning_rate": 7.285185185185186e-06,
"logits/chosen": 2.2366061210632324,
"logits/rejected": 2.297550916671753,
"logps/chosen": -0.9880453944206238,
"logps/rejected": -3.887558698654175,
"loss": 0.6517,
"nll_loss": 0.08710993081331253,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2964136004447937,
"rewards/margins": 0.8698541522026062,
"rewards/rejected": -1.1662677526474,
"step": 3100
},
{
"epoch": 0.27308849032994537,
"grad_norm": 6.401426792144775,
"learning_rate": 7.27283950617284e-06,
"logits/chosen": 2.1483216285705566,
"logits/rejected": 2.219287395477295,
"logps/chosen": -0.8323481678962708,
"logps/rejected": -1.9748185873031616,
"loss": 0.7784,
"nll_loss": 0.11523783206939697,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2497044801712036,
"rewards/margins": 0.34274110198020935,
"rewards/rejected": -0.5924455523490906,
"step": 3110
},
{
"epoch": 0.2739665883695915,
"grad_norm": 2.1834819316864014,
"learning_rate": 7.260493827160494e-06,
"logits/chosen": 2.21921968460083,
"logits/rejected": 2.247816801071167,
"logps/chosen": -0.8979324102401733,
"logps/rejected": -2.4821817874908447,
"loss": 0.753,
"nll_loss": 0.09049418568611145,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.26937970519065857,
"rewards/margins": 0.47527486085891724,
"rewards/rejected": -0.7446545362472534,
"step": 3120
},
{
"epoch": 0.2748446864092376,
"grad_norm": 2.105536937713623,
"learning_rate": 7.2481481481481485e-06,
"logits/chosen": 2.3172378540039062,
"logits/rejected": 2.3811116218566895,
"logps/chosen": -0.33308374881744385,
"logps/rejected": -1.8823559284210205,
"loss": 0.6691,
"nll_loss": 0.04073493555188179,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.09992513060569763,
"rewards/margins": 0.4647817015647888,
"rewards/rejected": -0.5647068023681641,
"step": 3130
},
{
"epoch": 0.2757227844488837,
"grad_norm": 5.6695556640625,
"learning_rate": 7.235802469135803e-06,
"logits/chosen": 2.484741687774658,
"logits/rejected": 2.524019718170166,
"logps/chosen": -0.6708775758743286,
"logps/rejected": -2.733649730682373,
"loss": 0.675,
"nll_loss": 0.06934002041816711,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.20126327872276306,
"rewards/margins": 0.6188317537307739,
"rewards/rejected": -0.8200949430465698,
"step": 3140
},
{
"epoch": 0.2766008824885298,
"grad_norm": 0.11841005086898804,
"learning_rate": 7.223456790123457e-06,
"logits/chosen": 2.3487932682037354,
"logits/rejected": 2.4092326164245605,
"logps/chosen": -0.5150425434112549,
"logps/rejected": -1.9013131856918335,
"loss": 0.7062,
"nll_loss": 0.05403406545519829,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.15451276302337646,
"rewards/margins": 0.4158812463283539,
"rewards/rejected": -0.570393979549408,
"step": 3150
},
{
"epoch": 0.277478980528176,
"grad_norm": 2.9380528926849365,
"learning_rate": 7.211111111111112e-06,
"logits/chosen": 2.3656086921691895,
"logits/rejected": 2.4282584190368652,
"logps/chosen": -0.6937441825866699,
"logps/rejected": -2.480012893676758,
"loss": 0.701,
"nll_loss": 0.07696821540594101,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.20812325179576874,
"rewards/margins": 0.5358806848526001,
"rewards/rejected": -0.74400395154953,
"step": 3160
},
{
"epoch": 0.2783570785678221,
"grad_norm": 5.344911575317383,
"learning_rate": 7.198765432098766e-06,
"logits/chosen": 2.2785983085632324,
"logits/rejected": 2.3243610858917236,
"logps/chosen": -0.7472286224365234,
"logps/rejected": -1.1668407917022705,
"loss": 0.7779,
"nll_loss": 0.07855083793401718,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.22416862845420837,
"rewards/margins": 0.12588365375995636,
"rewards/rejected": -0.35005226731300354,
"step": 3170
},
{
"epoch": 0.2792351766074682,
"grad_norm": 0.625324547290802,
"learning_rate": 7.18641975308642e-06,
"logits/chosen": 2.2220301628112793,
"logits/rejected": 2.2504611015319824,
"logps/chosen": -0.7789251208305359,
"logps/rejected": -2.3314409255981445,
"loss": 0.7352,
"nll_loss": 0.06989389657974243,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.23367755115032196,
"rewards/margins": 0.4657546877861023,
"rewards/rejected": -0.6994322538375854,
"step": 3180
},
{
"epoch": 0.28011327464711433,
"grad_norm": 2.6372480392456055,
"learning_rate": 7.174074074074074e-06,
"logits/chosen": 2.337601900100708,
"logits/rejected": 2.3855247497558594,
"logps/chosen": -0.4849260747432709,
"logps/rejected": -2.1613705158233643,
"loss": 0.6273,
"nll_loss": 0.052817367017269135,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.14547783136367798,
"rewards/margins": 0.5029333829879761,
"rewards/rejected": -0.6484112739562988,
"step": 3190
},
{
"epoch": 0.2809913726867605,
"grad_norm": 2.161203145980835,
"learning_rate": 7.1617283950617285e-06,
"logits/chosen": 2.1061933040618896,
"logits/rejected": 2.1634392738342285,
"logps/chosen": -0.5152336955070496,
"logps/rejected": -2.2968502044677734,
"loss": 0.6311,
"nll_loss": 0.04893555864691734,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.15457013249397278,
"rewards/margins": 0.5344849824905396,
"rewards/rejected": -0.6890550851821899,
"step": 3200
}
],
"logging_steps": 10,
"max_steps": 9000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}