simpo_r80 / checkpoint-64 /trainer_state.json
lzc0525's picture
Upload folder using huggingface_hub
c9d7a3f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1353154938883383,
"eval_steps": 500,
"global_step": 64,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002114304592005286,
"grad_norm": 1.5018059015274048,
"learning_rate": 2.083333333333333e-08,
"logits/chosen": -0.3466828167438507,
"logits/rejected": -0.30099987983703613,
"logps/chosen": -0.9345186948776245,
"logps/rejected": -0.9117153882980347,
"loss": 1.4889,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.869037389755249,
"rewards/margins": -0.04560665041208267,
"rewards/rejected": -1.8234307765960693,
"step": 1
},
{
"epoch": 0.004228609184010572,
"grad_norm": 0.8093975186347961,
"learning_rate": 4.166666666666666e-08,
"logits/chosen": -0.4310421049594879,
"logits/rejected": -0.39132067561149597,
"logps/chosen": -0.8198825716972351,
"logps/rejected": -0.8644211888313293,
"loss": 1.376,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.6397651433944702,
"rewards/margins": 0.08907715976238251,
"rewards/rejected": -1.7288423776626587,
"step": 2
},
{
"epoch": 0.006342913776015857,
"grad_norm": 0.5377389788627625,
"learning_rate": 6.25e-08,
"logits/chosen": -0.46692028641700745,
"logits/rejected": -0.4649256467819214,
"logps/chosen": -0.9087910652160645,
"logps/rejected": -0.9648240804672241,
"loss": 1.3404,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.817582130432129,
"rewards/margins": 0.11206617206335068,
"rewards/rejected": -1.9296481609344482,
"step": 3
},
{
"epoch": 0.008457218368021144,
"grad_norm": 0.3221875727176666,
"learning_rate": 8.333333333333333e-08,
"logits/chosen": -0.416828453540802,
"logits/rejected": -0.3584724962711334,
"logps/chosen": -0.7818898558616638,
"logps/rejected": -0.8170815110206604,
"loss": 1.3806,
"rewards/accuracies": 0.484375,
"rewards/chosen": -1.5637797117233276,
"rewards/margins": 0.07038339227437973,
"rewards/rejected": -1.6341630220413208,
"step": 4
},
{
"epoch": 0.010571522960026428,
"grad_norm": 0.64655601978302,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -0.376886248588562,
"logits/rejected": -0.3516141474246979,
"logps/chosen": -0.8814125061035156,
"logps/rejected": -1.0214396715164185,
"loss": 1.2741,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.7628250122070312,
"rewards/margins": 0.28005433082580566,
"rewards/rejected": -2.042879343032837,
"step": 5
},
{
"epoch": 0.012685827552031714,
"grad_norm": 0.4775894582271576,
"learning_rate": 1.25e-07,
"logits/chosen": -0.4757865369319916,
"logits/rejected": -0.4498941898345947,
"logps/chosen": -0.8962199687957764,
"logps/rejected": -0.9462199807167053,
"loss": 1.364,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.7924399375915527,
"rewards/margins": 0.10000008344650269,
"rewards/rejected": -1.8924399614334106,
"step": 6
},
{
"epoch": 0.014800132144037,
"grad_norm": 1.2459568977355957,
"learning_rate": 1.4583333333333335e-07,
"logits/chosen": -0.38895344734191895,
"logits/rejected": -0.38165366649627686,
"logps/chosen": -0.9025766253471375,
"logps/rejected": -0.9465017318725586,
"loss": 1.3898,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.805153250694275,
"rewards/margins": 0.0878501906991005,
"rewards/rejected": -1.8930034637451172,
"step": 7
},
{
"epoch": 0.016914436736042288,
"grad_norm": 0.6195729374885559,
"learning_rate": 1.6666666666666665e-07,
"logits/chosen": -0.3964853286743164,
"logits/rejected": -0.377862811088562,
"logps/chosen": -0.9054160118103027,
"logps/rejected": -0.9605879187583923,
"loss": 1.3821,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -1.8108320236206055,
"rewards/margins": 0.1103438138961792,
"rewards/rejected": -1.9211758375167847,
"step": 8
},
{
"epoch": 0.019028741328047574,
"grad_norm": 1.2074137926101685,
"learning_rate": 1.875e-07,
"logits/chosen": -0.3729037344455719,
"logits/rejected": -0.38143450021743774,
"logps/chosen": -0.9328653216362,
"logps/rejected": -0.9905799627304077,
"loss": 1.3754,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.8657306432724,
"rewards/margins": 0.11542946100234985,
"rewards/rejected": -1.9811599254608154,
"step": 9
},
{
"epoch": 0.021143045920052856,
"grad_norm": 0.2867220640182495,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -0.4263336658477783,
"logits/rejected": -0.42903271317481995,
"logps/chosen": -0.8979260325431824,
"logps/rejected": -0.9078099727630615,
"loss": 1.4438,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.7958520650863647,
"rewards/margins": 0.019767940044403076,
"rewards/rejected": -1.815619945526123,
"step": 10
},
{
"epoch": 0.023257350512058142,
"grad_norm": 0.8363026976585388,
"learning_rate": 2.2916666666666663e-07,
"logits/chosen": -0.3374914526939392,
"logits/rejected": -0.32399696111679077,
"logps/chosen": -0.8886098861694336,
"logps/rejected": -0.9484556317329407,
"loss": 1.3422,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.7772197723388672,
"rewards/margins": 0.11969132721424103,
"rewards/rejected": -1.8969112634658813,
"step": 11
},
{
"epoch": 0.025371655104063428,
"grad_norm": 0.5406804084777832,
"learning_rate": 2.5e-07,
"logits/chosen": -0.42844679951667786,
"logits/rejected": -0.37984615564346313,
"logps/chosen": -0.861629843711853,
"logps/rejected": -0.8968492150306702,
"loss": 1.3922,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.723259687423706,
"rewards/margins": 0.07043875753879547,
"rewards/rejected": -1.7936984300613403,
"step": 12
},
{
"epoch": 0.027485959696068714,
"grad_norm": 0.9919329285621643,
"learning_rate": 2.708333333333333e-07,
"logits/chosen": -0.36495402455329895,
"logits/rejected": -0.3249490261077881,
"logps/chosen": -0.8502095937728882,
"logps/rejected": -0.8470643758773804,
"loss": 1.4334,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.7004191875457764,
"rewards/margins": -0.006290358491241932,
"rewards/rejected": -1.6941287517547607,
"step": 13
},
{
"epoch": 0.029600264288074,
"grad_norm": 0.5477162003517151,
"learning_rate": 2.916666666666667e-07,
"logits/chosen": -0.4155704081058502,
"logits/rejected": -0.39535820484161377,
"logps/chosen": -1.0430240631103516,
"logps/rejected": -1.1318373680114746,
"loss": 1.3533,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -2.086048126220703,
"rewards/margins": 0.17762640118598938,
"rewards/rejected": -2.263674736022949,
"step": 14
},
{
"epoch": 0.031714568880079286,
"grad_norm": 0.26530712842941284,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -0.4810572564601898,
"logits/rejected": -0.42454615235328674,
"logps/chosen": -0.8741041421890259,
"logps/rejected": -0.9494178295135498,
"loss": 1.3655,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.7482082843780518,
"rewards/margins": 0.15062758326530457,
"rewards/rejected": -1.8988356590270996,
"step": 15
},
{
"epoch": 0.033828873472084575,
"grad_norm": 0.9272629618644714,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": -0.4440098702907562,
"logits/rejected": -0.3930297791957855,
"logps/chosen": -0.8473359942436218,
"logps/rejected": -0.9369213581085205,
"loss": 1.3248,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.6946719884872437,
"rewards/margins": 0.17917080223560333,
"rewards/rejected": -1.873842716217041,
"step": 16
},
{
"epoch": 0.03594317806408986,
"grad_norm": 0.5912418961524963,
"learning_rate": 3.541666666666667e-07,
"logits/chosen": -0.3838099539279938,
"logits/rejected": -0.3507584035396576,
"logps/chosen": -0.8888350129127502,
"logps/rejected": -0.9361770749092102,
"loss": 1.383,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -1.7776700258255005,
"rewards/margins": 0.0946839451789856,
"rewards/rejected": -1.8723541498184204,
"step": 17
},
{
"epoch": 0.03805748265609515,
"grad_norm": 0.6536504030227661,
"learning_rate": 3.75e-07,
"logits/chosen": -0.3581697940826416,
"logits/rejected": -0.3620460629463196,
"logps/chosen": -0.8519617319107056,
"logps/rejected": -0.9022184610366821,
"loss": 1.3841,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.7039234638214111,
"rewards/margins": 0.10051343590021133,
"rewards/rejected": -1.8044369220733643,
"step": 18
},
{
"epoch": 0.04017178724810043,
"grad_norm": 0.3433632552623749,
"learning_rate": 3.958333333333333e-07,
"logits/chosen": -0.37887442111968994,
"logits/rejected": -0.37543320655822754,
"logps/chosen": -0.9464104175567627,
"logps/rejected": -1.0017329454421997,
"loss": 1.3649,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.8928208351135254,
"rewards/margins": 0.11064518243074417,
"rewards/rejected": -2.0034658908843994,
"step": 19
},
{
"epoch": 0.04228609184010571,
"grad_norm": 0.9764007329940796,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -0.44110679626464844,
"logits/rejected": -0.4280649721622467,
"logps/chosen": -0.9046768546104431,
"logps/rejected": -1.0464633703231812,
"loss": 1.2592,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.8093537092208862,
"rewards/margins": 0.2835729420185089,
"rewards/rejected": -2.0929267406463623,
"step": 20
},
{
"epoch": 0.044400396432111,
"grad_norm": 1.8563830852508545,
"learning_rate": 4.375e-07,
"logits/chosen": -0.45183491706848145,
"logits/rejected": -0.42935287952423096,
"logps/chosen": -0.9043138027191162,
"logps/rejected": -0.9462392926216125,
"loss": 1.3784,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.8086276054382324,
"rewards/margins": 0.08385094255208969,
"rewards/rejected": -1.892478585243225,
"step": 21
},
{
"epoch": 0.046514701024116284,
"grad_norm": 1.3473299741744995,
"learning_rate": 4.5833333333333327e-07,
"logits/chosen": -0.37855517864227295,
"logits/rejected": -0.34429043531417847,
"logps/chosen": -0.9284683465957642,
"logps/rejected": -0.9454050064086914,
"loss": 1.4346,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -1.8569366931915283,
"rewards/margins": 0.03387312963604927,
"rewards/rejected": -1.8908100128173828,
"step": 22
},
{
"epoch": 0.04862900561612157,
"grad_norm": 0.940831184387207,
"learning_rate": 4.791666666666667e-07,
"logits/chosen": -0.39172160625457764,
"logits/rejected": -0.3695780634880066,
"logps/chosen": -0.9314202666282654,
"logps/rejected": -1.020229697227478,
"loss": 1.3322,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.8628405332565308,
"rewards/margins": 0.17761869728565216,
"rewards/rejected": -2.040459394454956,
"step": 23
},
{
"epoch": 0.050743310208126856,
"grad_norm": 0.5783158540725708,
"learning_rate": 5e-07,
"logits/chosen": -0.4958629608154297,
"logits/rejected": -0.4257377088069916,
"logps/chosen": -0.9379237294197083,
"logps/rejected": -0.9415461421012878,
"loss": 1.441,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.8758474588394165,
"rewards/margins": 0.0072449808940291405,
"rewards/rejected": -1.8830922842025757,
"step": 24
},
{
"epoch": 0.052857614800132145,
"grad_norm": 1.4209853410720825,
"learning_rate": 5.208333333333334e-07,
"logits/chosen": -0.36407172679901123,
"logits/rejected": -0.3331725299358368,
"logps/chosen": -0.9192589521408081,
"logps/rejected": -0.9595308899879456,
"loss": 1.3994,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.8385179042816162,
"rewards/margins": 0.080544114112854,
"rewards/rejected": -1.9190617799758911,
"step": 25
},
{
"epoch": 0.05497191939213743,
"grad_norm": 0.6310216188430786,
"learning_rate": 5.416666666666666e-07,
"logits/chosen": -0.41772690415382385,
"logits/rejected": -0.36565953493118286,
"logps/chosen": -0.8052878379821777,
"logps/rejected": -0.8673746585845947,
"loss": 1.3356,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -1.6105756759643555,
"rewards/margins": 0.12417369335889816,
"rewards/rejected": -1.7347493171691895,
"step": 26
},
{
"epoch": 0.05708622398414272,
"grad_norm": 1.2933462858200073,
"learning_rate": 5.625e-07,
"logits/chosen": -0.4482795000076294,
"logits/rejected": -0.39409321546554565,
"logps/chosen": -0.8339261412620544,
"logps/rejected": -0.8675202131271362,
"loss": 1.3739,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -1.6678522825241089,
"rewards/margins": 0.06718815863132477,
"rewards/rejected": -1.7350404262542725,
"step": 27
},
{
"epoch": 0.059200528576148,
"grad_norm": 0.5808025002479553,
"learning_rate": 5.833333333333334e-07,
"logits/chosen": -0.37116044759750366,
"logits/rejected": -0.3478051722049713,
"logps/chosen": -0.8950318694114685,
"logps/rejected": -0.9756672978401184,
"loss": 1.3505,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.790063738822937,
"rewards/margins": 0.1612708568572998,
"rewards/rejected": -1.9513345956802368,
"step": 28
},
{
"epoch": 0.06131483316815329,
"grad_norm": 1.0569533109664917,
"learning_rate": 6.041666666666666e-07,
"logits/chosen": -0.421148419380188,
"logits/rejected": -0.38443076610565186,
"logps/chosen": -0.8021283745765686,
"logps/rejected": -0.8370179533958435,
"loss": 1.3916,
"rewards/accuracies": 0.46875,
"rewards/chosen": -1.6042567491531372,
"rewards/margins": 0.06977920234203339,
"rewards/rejected": -1.674035906791687,
"step": 29
},
{
"epoch": 0.06342913776015857,
"grad_norm": 0.42577147483825684,
"learning_rate": 6.249999999999999e-07,
"logits/chosen": -0.4429818391799927,
"logits/rejected": -0.3524704575538635,
"logps/chosen": -0.8916822671890259,
"logps/rejected": -0.8985542058944702,
"loss": 1.4321,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -1.7833645343780518,
"rewards/margins": 0.01374388113617897,
"rewards/rejected": -1.7971084117889404,
"step": 30
},
{
"epoch": 0.06554344235216386,
"grad_norm": 1.0056904554367065,
"learning_rate": 6.458333333333333e-07,
"logits/chosen": -0.376451700925827,
"logits/rejected": -0.342519074678421,
"logps/chosen": -0.9038617014884949,
"logps/rejected": -0.953092634677887,
"loss": 1.398,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -1.8077234029769897,
"rewards/margins": 0.09846188127994537,
"rewards/rejected": -1.906185269355774,
"step": 31
},
{
"epoch": 0.06765774694416915,
"grad_norm": 0.5494012236595154,
"learning_rate": 6.666666666666666e-07,
"logits/chosen": -0.3459138870239258,
"logits/rejected": -0.3590989410877228,
"logps/chosen": -0.8274999260902405,
"logps/rejected": -0.8776509761810303,
"loss": 1.363,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.654999852180481,
"rewards/margins": 0.1003020703792572,
"rewards/rejected": -1.7553019523620605,
"step": 32
},
{
"epoch": 0.06977205153617443,
"grad_norm": 0.693267822265625,
"learning_rate": 6.875e-07,
"logits/chosen": -0.40053680539131165,
"logits/rejected": -0.37323904037475586,
"logps/chosen": -0.8255244493484497,
"logps/rejected": -0.8658804893493652,
"loss": 1.3712,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.6510488986968994,
"rewards/margins": 0.08071210980415344,
"rewards/rejected": -1.7317609786987305,
"step": 33
},
{
"epoch": 0.07188635612817972,
"grad_norm": 2.213238000869751,
"learning_rate": 7.083333333333334e-07,
"logits/chosen": -0.40097948908805847,
"logits/rejected": -0.38190510869026184,
"logps/chosen": -0.9122671484947205,
"logps/rejected": -0.9549552798271179,
"loss": 1.36,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.824534296989441,
"rewards/margins": 0.0853763073682785,
"rewards/rejected": -1.9099105596542358,
"step": 34
},
{
"epoch": 0.074000660720185,
"grad_norm": 0.6859830021858215,
"learning_rate": 7.291666666666666e-07,
"logits/chosen": -0.42501094937324524,
"logits/rejected": -0.42549416422843933,
"logps/chosen": -1.0008373260498047,
"logps/rejected": -1.1157118082046509,
"loss": 1.3294,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -2.0016746520996094,
"rewards/margins": 0.22974897921085358,
"rewards/rejected": -2.2314236164093018,
"step": 35
},
{
"epoch": 0.0761149653121903,
"grad_norm": 0.6468721628189087,
"learning_rate": 7.5e-07,
"logits/chosen": -0.36494994163513184,
"logits/rejected": -0.30433908104896545,
"logps/chosen": -0.9062094688415527,
"logps/rejected": -0.920263409614563,
"loss": 1.4312,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -1.8124189376831055,
"rewards/margins": 0.02810765616595745,
"rewards/rejected": -1.840526819229126,
"step": 36
},
{
"epoch": 0.07822926990419557,
"grad_norm": 0.5085556507110596,
"learning_rate": 7.708333333333333e-07,
"logits/chosen": -0.4677881598472595,
"logits/rejected": -0.456132709980011,
"logps/chosen": -1.0101865530014038,
"logps/rejected": -1.0429682731628418,
"loss": 1.4132,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -2.0203731060028076,
"rewards/margins": 0.06556359678506851,
"rewards/rejected": -2.0859365463256836,
"step": 37
},
{
"epoch": 0.08034357449620086,
"grad_norm": 0.23813335597515106,
"learning_rate": 7.916666666666666e-07,
"logits/chosen": -0.3991190791130066,
"logits/rejected": -0.3664044141769409,
"logps/chosen": -0.9578174352645874,
"logps/rejected": -0.9229263067245483,
"loss": 1.4824,
"rewards/accuracies": 0.4609375,
"rewards/chosen": -1.9156348705291748,
"rewards/margins": -0.06978224962949753,
"rewards/rejected": -1.8458526134490967,
"step": 38
},
{
"epoch": 0.08245787908820615,
"grad_norm": 0.587037980556488,
"learning_rate": 8.125e-07,
"logits/chosen": -0.37554049491882324,
"logits/rejected": -0.36305734515190125,
"logps/chosen": -0.8503091931343079,
"logps/rejected": -0.864615261554718,
"loss": 1.4086,
"rewards/accuracies": 0.4296875,
"rewards/chosen": -1.7006183862686157,
"rewards/margins": 0.028611989691853523,
"rewards/rejected": -1.729230523109436,
"step": 39
},
{
"epoch": 0.08457218368021142,
"grad_norm": 0.4172501862049103,
"learning_rate": 8.333333333333333e-07,
"logits/chosen": -0.4405443072319031,
"logits/rejected": -0.41723060607910156,
"logps/chosen": -0.8502858877182007,
"logps/rejected": -0.9114271402359009,
"loss": 1.3446,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.7005717754364014,
"rewards/margins": 0.12228240817785263,
"rewards/rejected": -1.8228542804718018,
"step": 40
},
{
"epoch": 0.08668648827221671,
"grad_norm": 0.9275372624397278,
"learning_rate": 8.541666666666666e-07,
"logits/chosen": -0.4200601577758789,
"logits/rejected": -0.3478623628616333,
"logps/chosen": -0.892408013343811,
"logps/rejected": -0.9276402592658997,
"loss": 1.3887,
"rewards/accuracies": 0.46875,
"rewards/chosen": -1.784816026687622,
"rewards/margins": 0.07046431303024292,
"rewards/rejected": -1.8552805185317993,
"step": 41
},
{
"epoch": 0.088800792864222,
"grad_norm": 0.7317383289337158,
"learning_rate": 8.75e-07,
"logits/chosen": -0.37675267457962036,
"logits/rejected": -0.33540332317352295,
"logps/chosen": -0.7866061925888062,
"logps/rejected": -0.824250340461731,
"loss": 1.3837,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.5732123851776123,
"rewards/margins": 0.07528844475746155,
"rewards/rejected": -1.648500680923462,
"step": 42
},
{
"epoch": 0.09091509745622729,
"grad_norm": 0.9452736973762512,
"learning_rate": 8.958333333333334e-07,
"logits/chosen": -0.4662383198738098,
"logits/rejected": -0.4447881579399109,
"logps/chosen": -0.9490666389465332,
"logps/rejected": -1.0112388134002686,
"loss": 1.3412,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.8981332778930664,
"rewards/margins": 0.12434446066617966,
"rewards/rejected": -2.022477626800537,
"step": 43
},
{
"epoch": 0.09302940204823257,
"grad_norm": 0.2848323881626129,
"learning_rate": 9.166666666666665e-07,
"logits/chosen": -0.41404005885124207,
"logits/rejected": -0.3944583535194397,
"logps/chosen": -0.8224930167198181,
"logps/rejected": -0.8416361808776855,
"loss": 1.4027,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.6449860334396362,
"rewards/margins": 0.038286346942186356,
"rewards/rejected": -1.683272361755371,
"step": 44
},
{
"epoch": 0.09514370664023786,
"grad_norm": 0.7165678143501282,
"learning_rate": 9.374999999999999e-07,
"logits/chosen": -0.40475326776504517,
"logits/rejected": -0.3559921383857727,
"logps/chosen": -0.8070214986801147,
"logps/rejected": -0.8993593454360962,
"loss": 1.3148,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.6140429973602295,
"rewards/margins": 0.18467575311660767,
"rewards/rejected": -1.7987186908721924,
"step": 45
},
{
"epoch": 0.09725801123224315,
"grad_norm": 0.4779021739959717,
"learning_rate": 9.583333333333334e-07,
"logits/chosen": -0.4171525835990906,
"logits/rejected": -0.42166149616241455,
"logps/chosen": -0.7872560024261475,
"logps/rejected": -0.8496187925338745,
"loss": 1.3356,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.574512004852295,
"rewards/margins": 0.12472567707300186,
"rewards/rejected": -1.699237585067749,
"step": 46
},
{
"epoch": 0.09937231582424844,
"grad_norm": 0.7870219349861145,
"learning_rate": 9.791666666666667e-07,
"logits/chosen": -0.3734116554260254,
"logits/rejected": -0.32778748869895935,
"logps/chosen": -0.7842286825180054,
"logps/rejected": -0.8161548972129822,
"loss": 1.3647,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -1.5684573650360107,
"rewards/margins": 0.06385258585214615,
"rewards/rejected": -1.6323097944259644,
"step": 47
},
{
"epoch": 0.10148662041625371,
"grad_norm": 0.2597256600856781,
"learning_rate": 1e-06,
"logits/chosen": -0.4355677664279938,
"logits/rejected": -0.38983187079429626,
"logps/chosen": -0.8787693977355957,
"logps/rejected": -0.9383041262626648,
"loss": 1.35,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.7575387954711914,
"rewards/margins": 0.11906948685646057,
"rewards/rejected": -1.8766082525253296,
"step": 48
},
{
"epoch": 0.103600925008259,
"grad_norm": 0.9942799210548401,
"learning_rate": 9.999862751990697e-07,
"logits/chosen": -0.4244321882724762,
"logits/rejected": -0.4366786777973175,
"logps/chosen": -0.7910157442092896,
"logps/rejected": -0.8630884885787964,
"loss": 1.3166,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -1.582031488418579,
"rewards/margins": 0.14414538443088531,
"rewards/rejected": -1.7261769771575928,
"step": 49
},
{
"epoch": 0.10571522960026429,
"grad_norm": 0.5333903431892395,
"learning_rate": 9.999451015497595e-07,
"logits/chosen": -0.389942467212677,
"logits/rejected": -0.36674585938453674,
"logps/chosen": -0.7312074899673462,
"logps/rejected": -0.7289648652076721,
"loss": 1.4225,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -1.4624149799346924,
"rewards/margins": -0.004485193639993668,
"rewards/rejected": -1.4579297304153442,
"step": 50
},
{
"epoch": 0.10782953419226958,
"grad_norm": 0.5712242722511292,
"learning_rate": 9.9987648131247e-07,
"logits/chosen": -0.4622853994369507,
"logits/rejected": -0.3728552460670471,
"logps/chosen": -0.8764299750328064,
"logps/rejected": -0.869678795337677,
"loss": 1.4542,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.7528599500656128,
"rewards/margins": -0.013502337038516998,
"rewards/rejected": -1.739357590675354,
"step": 51
},
{
"epoch": 0.10994383878427486,
"grad_norm": 0.2586441934108734,
"learning_rate": 9.99780418254397e-07,
"logits/chosen": -0.37249019742012024,
"logits/rejected": -0.3998304605484009,
"logps/chosen": -0.8435611724853516,
"logps/rejected": -0.9359882473945618,
"loss": 1.3057,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.6871223449707031,
"rewards/margins": 0.18485431373119354,
"rewards/rejected": -1.8719764947891235,
"step": 52
},
{
"epoch": 0.11205814337628014,
"grad_norm": 1.0829113721847534,
"learning_rate": 9.996569176493268e-07,
"logits/chosen": -0.47697725892066956,
"logits/rejected": -0.4208195209503174,
"logps/chosen": -0.8014968037605286,
"logps/rejected": -0.8703804612159729,
"loss": 1.3523,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.6029936075210571,
"rewards/margins": 0.1377674788236618,
"rewards/rejected": -1.7407609224319458,
"step": 53
},
{
"epoch": 0.11417244796828543,
"grad_norm": 0.5523208379745483,
"learning_rate": 9.995059862773438e-07,
"logits/chosen": -0.40533363819122314,
"logits/rejected": -0.36801978945732117,
"logps/chosen": -0.7641825675964355,
"logps/rejected": -0.8168596029281616,
"loss": 1.3692,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.528365135192871,
"rewards/margins": 0.10535416752099991,
"rewards/rejected": -1.6337192058563232,
"step": 54
},
{
"epoch": 0.11628675256029072,
"grad_norm": 0.614101767539978,
"learning_rate": 9.993276324244605e-07,
"logits/chosen": -0.4476906955242157,
"logits/rejected": -0.40396648645401,
"logps/chosen": -0.8706808090209961,
"logps/rejected": -0.9221430420875549,
"loss": 1.3787,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.7413616180419922,
"rewards/margins": 0.10292442888021469,
"rewards/rejected": -1.8442860841751099,
"step": 55
},
{
"epoch": 0.118401057152296,
"grad_norm": 0.3428778052330017,
"learning_rate": 9.991218658821608e-07,
"logits/chosen": -0.31709593534469604,
"logits/rejected": -0.2760937213897705,
"logps/chosen": -0.842248797416687,
"logps/rejected": -0.8068034648895264,
"loss": 1.498,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.684497594833374,
"rewards/margins": -0.07089066505432129,
"rewards/rejected": -1.6136069297790527,
"step": 56
},
{
"epoch": 0.12051536174430129,
"grad_norm": 0.6877723932266235,
"learning_rate": 9.988886979468643e-07,
"logits/chosen": -0.41800016164779663,
"logits/rejected": -0.4011584222316742,
"logps/chosen": -0.7845420837402344,
"logps/rejected": -0.834447979927063,
"loss": 1.3491,
"rewards/accuracies": 0.546875,
"rewards/chosen": -1.5690841674804688,
"rewards/margins": 0.09981165081262589,
"rewards/rejected": -1.668895959854126,
"step": 57
},
{
"epoch": 0.12262966633630658,
"grad_norm": 0.9649701714515686,
"learning_rate": 9.98628141419305e-07,
"logits/chosen": -0.4253537058830261,
"logits/rejected": -0.4305458962917328,
"logps/chosen": -0.86476731300354,
"logps/rejected": -0.9080386161804199,
"loss": 1.3639,
"rewards/accuracies": 0.515625,
"rewards/chosen": -1.72953462600708,
"rewards/margins": 0.08654248714447021,
"rewards/rejected": -1.8160772323608398,
"step": 58
},
{
"epoch": 0.12474397092831185,
"grad_norm": 1.3779780864715576,
"learning_rate": 9.98340210603829e-07,
"logits/chosen": -0.39970022439956665,
"logits/rejected": -0.441428005695343,
"logps/chosen": -0.8662775158882141,
"logps/rejected": -0.9646260738372803,
"loss": 1.3001,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -1.7325550317764282,
"rewards/margins": 0.19669723510742188,
"rewards/rejected": -1.9292521476745605,
"step": 59
},
{
"epoch": 0.12685827552031714,
"grad_norm": 0.5366966724395752,
"learning_rate": 9.980249213076084e-07,
"logits/chosen": -0.37770116329193115,
"logits/rejected": -0.35231757164001465,
"logps/chosen": -0.8165755867958069,
"logps/rejected": -0.8619179129600525,
"loss": 1.3699,
"rewards/accuracies": 0.578125,
"rewards/chosen": -1.6331511735916138,
"rewards/margins": 0.09068439900875092,
"rewards/rejected": -1.723835825920105,
"step": 60
},
{
"epoch": 0.12897258011232243,
"grad_norm": 0.36810922622680664,
"learning_rate": 9.976822908397748e-07,
"logits/chosen": -0.4224976897239685,
"logits/rejected": -0.41758257150650024,
"logps/chosen": -0.8445641994476318,
"logps/rejected": -0.9393664598464966,
"loss": 1.3193,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -1.6891283988952637,
"rewards/margins": 0.18960458040237427,
"rewards/rejected": -1.8787329196929932,
"step": 61
},
{
"epoch": 0.13108688470432772,
"grad_norm": 0.6838279366493225,
"learning_rate": 9.97312338010468e-07,
"logits/chosen": -0.4168627858161926,
"logits/rejected": -0.36115381121635437,
"logps/chosen": -0.8370552659034729,
"logps/rejected": -0.8352169394493103,
"loss": 1.4284,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -1.6741105318069458,
"rewards/margins": -0.0036766715347766876,
"rewards/rejected": -1.6704338788986206,
"step": 62
},
{
"epoch": 0.133201189296333,
"grad_norm": 0.39330533146858215,
"learning_rate": 9.969150831298037e-07,
"logits/chosen": -0.4558233618736267,
"logits/rejected": -0.4025765061378479,
"logps/chosen": -0.826255738735199,
"logps/rejected": -0.894213080406189,
"loss": 1.3485,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.652511477470398,
"rewards/margins": 0.13591471314430237,
"rewards/rejected": -1.788426160812378,
"step": 63
},
{
"epoch": 0.1353154938883383,
"grad_norm": 0.6055929660797119,
"learning_rate": 9.964905480067584e-07,
"logits/chosen": -0.459463506937027,
"logits/rejected": -0.42943331599235535,
"logps/chosen": -0.7901928424835205,
"logps/rejected": -0.7964221239089966,
"loss": 1.4057,
"rewards/accuracies": 0.484375,
"rewards/chosen": -1.580385684967041,
"rewards/margins": 0.012458762153983116,
"rewards/rejected": -1.5928442478179932,
"step": 64
}
],
"logging_steps": 1,
"max_steps": 472,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 64,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}