qwen1_chat_reflct_adamw_iter6 / trainer_state.json
yiran-wang3's picture
End of training
59dfe77 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 64,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"debug/policy_chosen_logits": -1.0596340894699097,
"debug/policy_chosen_logps": -179.04273986816406,
"debug/policy_rejected_logits": -1.1748394966125488,
"debug/policy_rejected_logps": -295.01690673828125,
"debug/reference_chosen_logps": -179.04273986816406,
"debug/reference_rejected_logps": -295.01690673828125,
"epoch": 0.015625,
"grad_norm": 52.30319105460711,
"learning_rate": 1e-06,
"logits/chosen": -1.0596340894699097,
"logits/rejected": -1.1748394966125488,
"logps/chosen": -179.04273986816406,
"logps/rejected": -295.01690673828125,
"loss": 0.5,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"debug/policy_chosen_logits": -1.1150486469268799,
"debug/policy_chosen_logps": -124.63790893554688,
"debug/policy_rejected_logits": -1.0623761415481567,
"debug/policy_rejected_logps": -270.75244140625,
"debug/reference_chosen_logps": -125.14633178710938,
"debug/reference_rejected_logps": -271.20208740234375,
"epoch": 0.03125,
"grad_norm": 29.130704023833047,
"learning_rate": 1e-06,
"logits/chosen": -1.1150486469268799,
"logits/rejected": -1.0623761415481567,
"logps/chosen": -124.63790893554688,
"logps/rejected": -270.75244140625,
"loss": 0.4989,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.005084190517663956,
"rewards/margins": 0.0005879019154235721,
"rewards/rejected": 0.0044962880201637745,
"step": 2
},
{
"debug/policy_chosen_logits": -1.1071562767028809,
"debug/policy_chosen_logps": -136.3170166015625,
"debug/policy_rejected_logits": -1.1613606214523315,
"debug/policy_rejected_logps": -268.709228515625,
"debug/reference_chosen_logps": -137.68783569335938,
"debug/reference_rejected_logps": -268.8507995605469,
"epoch": 0.046875,
"grad_norm": 24.965184935253273,
"learning_rate": 1e-06,
"logits/chosen": -1.1071562767028809,
"logits/rejected": -1.1613606214523315,
"logps/chosen": -136.3170166015625,
"logps/rejected": -268.709228515625,
"loss": 0.4914,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.013708190061151981,
"rewards/margins": 0.012292098253965378,
"rewards/rejected": 0.0014160918071866035,
"step": 3
},
{
"debug/policy_chosen_logits": -1.066061019897461,
"debug/policy_chosen_logps": -153.8428192138672,
"debug/policy_rejected_logits": -1.1866570711135864,
"debug/policy_rejected_logps": -274.9277648925781,
"debug/reference_chosen_logps": -155.69000244140625,
"debug/reference_rejected_logps": -275.12884521484375,
"epoch": 0.0625,
"grad_norm": 24.49810670915077,
"learning_rate": 1e-06,
"logits/chosen": -1.066061019897461,
"logits/rejected": -1.1866570711135864,
"logps/chosen": -153.8428192138672,
"logps/rejected": -274.9277648925781,
"loss": 0.4777,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.018471689894795418,
"rewards/margins": 0.016461096704006195,
"rewards/rejected": 0.0020105931907892227,
"step": 4
},
{
"debug/policy_chosen_logits": -1.0783909559249878,
"debug/policy_chosen_logps": -161.8551483154297,
"debug/policy_rejected_logits": -1.1809625625610352,
"debug/policy_rejected_logps": -291.5763244628906,
"debug/reference_chosen_logps": -165.77706909179688,
"debug/reference_rejected_logps": -290.215087890625,
"epoch": 0.078125,
"grad_norm": 22.066344534464825,
"learning_rate": 1e-06,
"logits/chosen": -1.0783909559249878,
"logits/rejected": -1.1809625625610352,
"logps/chosen": -161.8551483154297,
"logps/rejected": -291.5763244628906,
"loss": 0.4425,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.039219196885824203,
"rewards/margins": 0.05283135548233986,
"rewards/rejected": -0.013612156733870506,
"step": 5
},
{
"debug/policy_chosen_logits": -1.0005463361740112,
"debug/policy_chosen_logps": -177.85003662109375,
"debug/policy_rejected_logits": -1.0288403034210205,
"debug/policy_rejected_logps": -263.21014404296875,
"debug/reference_chosen_logps": -178.246337890625,
"debug/reference_rejected_logps": -263.5099182128906,
"epoch": 0.09375,
"grad_norm": 41.16778948079108,
"learning_rate": 1e-06,
"logits/chosen": -1.0005463361740112,
"logits/rejected": -1.0288403034210205,
"logps/chosen": -177.85003662109375,
"logps/rejected": -263.21014404296875,
"loss": 0.4659,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.003962935879826546,
"rewards/margins": 0.0009648129343986511,
"rewards/rejected": 0.0029981210827827454,
"step": 6
},
{
"debug/policy_chosen_logits": -0.9317433834075928,
"debug/policy_chosen_logps": -155.7017822265625,
"debug/policy_rejected_logits": -1.3209773302078247,
"debug/policy_rejected_logps": -308.2155456542969,
"debug/reference_chosen_logps": -165.14569091796875,
"debug/reference_rejected_logps": -295.0081481933594,
"epoch": 0.109375,
"grad_norm": 15.904262612549944,
"learning_rate": 1e-06,
"logits/chosen": -0.9317433834075928,
"logits/rejected": -1.3209773302078247,
"logps/chosen": -155.7017822265625,
"logps/rejected": -308.2155456542969,
"loss": 0.4323,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0944390594959259,
"rewards/margins": 0.22651299834251404,
"rewards/rejected": -0.13207395374774933,
"step": 7
},
{
"debug/policy_chosen_logits": -1.0539729595184326,
"debug/policy_chosen_logps": -173.66781616210938,
"debug/policy_rejected_logits": -1.0206472873687744,
"debug/policy_rejected_logps": -271.9588317871094,
"debug/reference_chosen_logps": -177.30899047851562,
"debug/reference_rejected_logps": -263.9579162597656,
"epoch": 0.125,
"grad_norm": 15.028091497342194,
"learning_rate": 1e-06,
"logits/chosen": -1.0539729595184326,
"logits/rejected": -1.0206472873687744,
"logps/chosen": -173.66781616210938,
"logps/rejected": -271.9588317871094,
"loss": 0.4255,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.036411646753549576,
"rewards/margins": 0.1164209246635437,
"rewards/rejected": -0.08000928163528442,
"step": 8
},
{
"debug/policy_chosen_logits": -0.9866081476211548,
"debug/policy_chosen_logps": -176.56866455078125,
"debug/policy_rejected_logits": -0.9740838408470154,
"debug/policy_rejected_logps": -272.35650634765625,
"debug/reference_chosen_logps": -177.0741729736328,
"debug/reference_rejected_logps": -260.4818420410156,
"epoch": 0.140625,
"grad_norm": 32.8906220838234,
"learning_rate": 1e-06,
"logits/chosen": -0.9866081476211548,
"logits/rejected": -0.9740838408470154,
"logps/chosen": -176.56866455078125,
"logps/rejected": -272.35650634765625,
"loss": 0.4107,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.005055226851254702,
"rewards/margins": 0.12380212545394897,
"rewards/rejected": -0.11874689161777496,
"step": 9
},
{
"debug/policy_chosen_logits": -0.9441277980804443,
"debug/policy_chosen_logps": -164.24789428710938,
"debug/policy_rejected_logits": -1.1364271640777588,
"debug/policy_rejected_logps": -292.0938720703125,
"debug/reference_chosen_logps": -160.7564697265625,
"debug/reference_rejected_logps": -257.1752014160156,
"epoch": 0.15625,
"grad_norm": 28.811843166780264,
"learning_rate": 1e-06,
"logits/chosen": -0.9441277980804443,
"logits/rejected": -1.1364271640777588,
"logps/chosen": -164.24789428710938,
"logps/rejected": -292.0938720703125,
"loss": 0.4237,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.03491419926285744,
"rewards/margins": 0.3142724931240082,
"rewards/rejected": -0.3491867184638977,
"step": 10
},
{
"debug/policy_chosen_logits": -0.9175143837928772,
"debug/policy_chosen_logps": -214.65664672851562,
"debug/policy_rejected_logits": -1.1515822410583496,
"debug/policy_rejected_logps": -244.6530303955078,
"debug/reference_chosen_logps": -207.79930114746094,
"debug/reference_rejected_logps": -230.90333557128906,
"epoch": 0.171875,
"grad_norm": 38.46428758925275,
"learning_rate": 1e-06,
"logits/chosen": -0.9175143837928772,
"logits/rejected": -1.1515822410583496,
"logps/chosen": -214.65664672851562,
"logps/rejected": -244.6530303955078,
"loss": 0.4949,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.06857340782880783,
"rewards/margins": 0.06892354786396027,
"rewards/rejected": -0.1374969631433487,
"step": 11
},
{
"debug/policy_chosen_logits": -0.8965519070625305,
"debug/policy_chosen_logps": -153.26284790039062,
"debug/policy_rejected_logits": -1.1321805715560913,
"debug/policy_rejected_logps": -318.78076171875,
"debug/reference_chosen_logps": -154.14707946777344,
"debug/reference_rejected_logps": -289.067138671875,
"epoch": 0.1875,
"grad_norm": 41.433140559474445,
"learning_rate": 1e-06,
"logits/chosen": -0.8965519070625305,
"logits/rejected": -1.1321805715560913,
"logps/chosen": -153.26284790039062,
"logps/rejected": -318.78076171875,
"loss": 0.4907,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.008842326700687408,
"rewards/margins": 0.30597835779190063,
"rewards/rejected": -0.2971360683441162,
"step": 12
},
{
"debug/policy_chosen_logits": -0.945601761341095,
"debug/policy_chosen_logps": -122.90229797363281,
"debug/policy_rejected_logits": -1.0716924667358398,
"debug/policy_rejected_logps": -274.2931823730469,
"debug/reference_chosen_logps": -120.32145690917969,
"debug/reference_rejected_logps": -250.55557250976562,
"epoch": 0.203125,
"grad_norm": 26.79881614435138,
"learning_rate": 1e-06,
"logits/chosen": -0.945601761341095,
"logits/rejected": -1.0716924667358398,
"logps/chosen": -122.90229797363281,
"logps/rejected": -274.2931823730469,
"loss": 0.4694,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.02580837532877922,
"rewards/margins": 0.2115677297115326,
"rewards/rejected": -0.2373761087656021,
"step": 13
},
{
"debug/policy_chosen_logits": -1.0047388076782227,
"debug/policy_chosen_logps": -200.4830780029297,
"debug/policy_rejected_logits": -1.1980981826782227,
"debug/policy_rejected_logps": -315.792236328125,
"debug/reference_chosen_logps": -190.80075073242188,
"debug/reference_rejected_logps": -281.5347595214844,
"epoch": 0.21875,
"grad_norm": 27.316365360407435,
"learning_rate": 1e-06,
"logits/chosen": -1.0047388076782227,
"logits/rejected": -1.1980981826782227,
"logps/chosen": -200.4830780029297,
"logps/rejected": -315.792236328125,
"loss": 0.4324,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09682333469390869,
"rewards/margins": 0.24575121700763702,
"rewards/rejected": -0.3425745368003845,
"step": 14
},
{
"debug/policy_chosen_logits": -1.1348706483840942,
"debug/policy_chosen_logps": -208.73074340820312,
"debug/policy_rejected_logits": -1.121549129486084,
"debug/policy_rejected_logps": -310.7353210449219,
"debug/reference_chosen_logps": -204.0843048095703,
"debug/reference_rejected_logps": -281.996337890625,
"epoch": 0.234375,
"grad_norm": 58.18504208169894,
"learning_rate": 1e-06,
"logits/chosen": -1.1348706483840942,
"logits/rejected": -1.121549129486084,
"logps/chosen": -208.73074340820312,
"logps/rejected": -310.7353210449219,
"loss": 0.4662,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.04646441712975502,
"rewards/margins": 0.24092541635036469,
"rewards/rejected": -0.2873898148536682,
"step": 15
},
{
"debug/policy_chosen_logits": -0.9974825978279114,
"debug/policy_chosen_logps": -154.0273895263672,
"debug/policy_rejected_logits": -1.1503194570541382,
"debug/policy_rejected_logps": -307.7276611328125,
"debug/reference_chosen_logps": -154.69586181640625,
"debug/reference_rejected_logps": -273.1531677246094,
"epoch": 0.25,
"grad_norm": 56.48600158612175,
"learning_rate": 1e-06,
"logits/chosen": -0.9974825978279114,
"logits/rejected": -1.1503194570541382,
"logps/chosen": -154.0273895263672,
"logps/rejected": -307.7276611328125,
"loss": 0.4093,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.006684892810881138,
"rewards/margins": 0.3524298071861267,
"rewards/rejected": -0.34574490785598755,
"step": 16
},
{
"debug/policy_chosen_logits": -1.0567247867584229,
"debug/policy_chosen_logps": -137.61720275878906,
"debug/policy_rejected_logits": -1.0961592197418213,
"debug/policy_rejected_logps": -313.12060546875,
"debug/reference_chosen_logps": -135.6652069091797,
"debug/reference_rejected_logps": -297.18695068359375,
"epoch": 0.265625,
"grad_norm": 31.458528575785774,
"learning_rate": 1e-06,
"logits/chosen": -1.0567247867584229,
"logits/rejected": -1.0961592197418213,
"logps/chosen": -137.61720275878906,
"logps/rejected": -313.12060546875,
"loss": 0.4473,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.01951989158987999,
"rewards/margins": 0.13981682062149048,
"rewards/rejected": -0.15933671593666077,
"step": 17
},
{
"debug/policy_chosen_logits": -1.0550764799118042,
"debug/policy_chosen_logps": -143.5434112548828,
"debug/policy_rejected_logits": -1.3183400630950928,
"debug/policy_rejected_logps": -359.35418701171875,
"debug/reference_chosen_logps": -157.90188598632812,
"debug/reference_rejected_logps": -317.474853515625,
"epoch": 0.28125,
"grad_norm": 24.003283570475016,
"learning_rate": 1e-06,
"logits/chosen": -1.0550764799118042,
"logits/rejected": -1.3183400630950928,
"logps/chosen": -143.5434112548828,
"logps/rejected": -359.35418701171875,
"loss": 0.4381,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14358465373516083,
"rewards/margins": 0.5623779296875,
"rewards/rejected": -0.418793261051178,
"step": 18
},
{
"debug/policy_chosen_logits": -1.1249719858169556,
"debug/policy_chosen_logps": -163.00375366210938,
"debug/policy_rejected_logits": -1.1258165836334229,
"debug/policy_rejected_logps": -283.2430725097656,
"debug/reference_chosen_logps": -166.72418212890625,
"debug/reference_rejected_logps": -264.2232360839844,
"epoch": 0.296875,
"grad_norm": 41.98096605753313,
"learning_rate": 1e-06,
"logits/chosen": -1.1249719858169556,
"logits/rejected": -1.1258165836334229,
"logps/chosen": -163.00375366210938,
"logps/rejected": -283.2430725097656,
"loss": 0.4597,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.03720443695783615,
"rewards/margins": 0.22740286588668823,
"rewards/rejected": -0.19019843637943268,
"step": 19
},
{
"debug/policy_chosen_logits": -1.000652551651001,
"debug/policy_chosen_logps": -174.8540802001953,
"debug/policy_rejected_logits": -1.075732946395874,
"debug/policy_rejected_logps": -248.3970947265625,
"debug/reference_chosen_logps": -179.03424072265625,
"debug/reference_rejected_logps": -235.50778198242188,
"epoch": 0.3125,
"grad_norm": 26.892461198324778,
"learning_rate": 1e-06,
"logits/chosen": -1.000652551651001,
"logits/rejected": -1.075732946395874,
"logps/chosen": -174.8540802001953,
"logps/rejected": -248.3970947265625,
"loss": 0.4325,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.041801512241363525,
"rewards/margins": 0.1706947386264801,
"rewards/rejected": -0.12889322638511658,
"step": 20
},
{
"debug/policy_chosen_logits": -1.0587340593338013,
"debug/policy_chosen_logps": -148.18423461914062,
"debug/policy_rejected_logits": -1.435739278793335,
"debug/policy_rejected_logps": -331.1427001953125,
"debug/reference_chosen_logps": -151.2082061767578,
"debug/reference_rejected_logps": -314.77117919921875,
"epoch": 0.328125,
"grad_norm": 15.800648562809261,
"learning_rate": 1e-06,
"logits/chosen": -1.0587340593338013,
"logits/rejected": -1.435739278793335,
"logps/chosen": -148.18423461914062,
"logps/rejected": -331.1427001953125,
"loss": 0.3982,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.030239801853895187,
"rewards/margins": 0.1939551830291748,
"rewards/rejected": -0.16371536254882812,
"step": 21
},
{
"debug/policy_chosen_logits": -1.0205200910568237,
"debug/policy_chosen_logps": -157.31350708007812,
"debug/policy_rejected_logits": -1.0888888835906982,
"debug/policy_rejected_logps": -346.0768127441406,
"debug/reference_chosen_logps": -161.5574493408203,
"debug/reference_rejected_logps": -338.91650390625,
"epoch": 0.34375,
"grad_norm": 21.49065797596958,
"learning_rate": 1e-06,
"logits/chosen": -1.0205200910568237,
"logits/rejected": -1.0888888835906982,
"logps/chosen": -157.31350708007812,
"logps/rejected": -346.0768127441406,
"loss": 0.4361,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.042439430952072144,
"rewards/margins": 0.11404269933700562,
"rewards/rejected": -0.07160326838493347,
"step": 22
},
{
"debug/policy_chosen_logits": -1.1462302207946777,
"debug/policy_chosen_logps": -195.76788330078125,
"debug/policy_rejected_logits": -1.2484185695648193,
"debug/policy_rejected_logps": -277.576904296875,
"debug/reference_chosen_logps": -198.74685668945312,
"debug/reference_rejected_logps": -265.5393981933594,
"epoch": 0.359375,
"grad_norm": 17.749863549342045,
"learning_rate": 1e-06,
"logits/chosen": -1.1462302207946777,
"logits/rejected": -1.2484185695648193,
"logps/chosen": -195.76788330078125,
"logps/rejected": -277.576904296875,
"loss": 0.4165,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.029789581894874573,
"rewards/margins": 0.1501646637916565,
"rewards/rejected": -0.12037509679794312,
"step": 23
},
{
"debug/policy_chosen_logits": -0.952358067035675,
"debug/policy_chosen_logps": -115.6708984375,
"debug/policy_rejected_logits": -1.036898136138916,
"debug/policy_rejected_logps": -245.47000122070312,
"debug/reference_chosen_logps": -131.1976776123047,
"debug/reference_rejected_logps": -238.638427734375,
"epoch": 0.375,
"grad_norm": 16.031924320507283,
"learning_rate": 1e-06,
"logits/chosen": -0.952358067035675,
"logits/rejected": -1.036898136138916,
"logps/chosen": -115.6708984375,
"logps/rejected": -245.47000122070312,
"loss": 0.3771,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.1552678644657135,
"rewards/margins": 0.22358371317386627,
"rewards/rejected": -0.06831584870815277,
"step": 24
},
{
"debug/policy_chosen_logits": -1.070897102355957,
"debug/policy_chosen_logps": -178.87374877929688,
"debug/policy_rejected_logits": -1.1623822450637817,
"debug/policy_rejected_logps": -243.98184204101562,
"debug/reference_chosen_logps": -179.05862426757812,
"debug/reference_rejected_logps": -244.07818603515625,
"epoch": 0.390625,
"grad_norm": 38.66586744942012,
"learning_rate": 1e-06,
"logits/chosen": -1.070897102355957,
"logits/rejected": -1.1623822450637817,
"logps/chosen": -178.87374877929688,
"logps/rejected": -243.98184204101562,
"loss": 0.4396,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.001848660409450531,
"rewards/margins": 0.0008851997554302216,
"rewards/rejected": 0.000963456928730011,
"step": 25
},
{
"debug/policy_chosen_logits": -1.1025017499923706,
"debug/policy_chosen_logps": -173.5986328125,
"debug/policy_rejected_logits": -1.1473654508590698,
"debug/policy_rejected_logps": -245.47994995117188,
"debug/reference_chosen_logps": -186.88778686523438,
"debug/reference_rejected_logps": -241.27210998535156,
"epoch": 0.40625,
"grad_norm": 50.21384448251296,
"learning_rate": 1e-06,
"logits/chosen": -1.1025017499923706,
"logits/rejected": -1.1473654508590698,
"logps/chosen": -173.5986328125,
"logps/rejected": -245.47994995117188,
"loss": 0.4023,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.13289162516593933,
"rewards/margins": 0.1749698668718338,
"rewards/rejected": -0.04207824170589447,
"step": 26
},
{
"debug/policy_chosen_logits": -1.0175386667251587,
"debug/policy_chosen_logps": -94.42786407470703,
"debug/policy_rejected_logits": -1.1346431970596313,
"debug/policy_rejected_logps": -240.36541748046875,
"debug/reference_chosen_logps": -106.56871032714844,
"debug/reference_rejected_logps": -240.29310607910156,
"epoch": 0.421875,
"grad_norm": 17.96088818186707,
"learning_rate": 1e-06,
"logits/chosen": -1.0175386667251587,
"logits/rejected": -1.1346431970596313,
"logps/chosen": -94.42786407470703,
"logps/rejected": -240.36541748046875,
"loss": 0.39,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.12140841782093048,
"rewards/margins": 0.12213139981031418,
"rewards/rejected": -0.0007229708135128021,
"step": 27
},
{
"debug/policy_chosen_logits": -1.1408073902130127,
"debug/policy_chosen_logps": -126.60142517089844,
"debug/policy_rejected_logits": -1.20956289768219,
"debug/policy_rejected_logps": -313.8656311035156,
"debug/reference_chosen_logps": -130.33799743652344,
"debug/reference_rejected_logps": -291.3277893066406,
"epoch": 0.4375,
"grad_norm": 29.993745130410183,
"learning_rate": 1e-06,
"logits/chosen": -1.1408073902130127,
"logits/rejected": -1.20956289768219,
"logps/chosen": -126.60142517089844,
"logps/rejected": -313.8656311035156,
"loss": 0.3947,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.03736574202775955,
"rewards/margins": 0.2627440094947815,
"rewards/rejected": -0.22537828981876373,
"step": 28
},
{
"debug/policy_chosen_logits": -1.0291798114776611,
"debug/policy_chosen_logps": -192.92529296875,
"debug/policy_rejected_logits": -1.2137432098388672,
"debug/policy_rejected_logps": -315.1015930175781,
"debug/reference_chosen_logps": -184.9921875,
"debug/reference_rejected_logps": -301.6517639160156,
"epoch": 0.453125,
"grad_norm": 51.5929899248971,
"learning_rate": 1e-06,
"logits/chosen": -1.0291798114776611,
"logits/rejected": -1.2137432098388672,
"logps/chosen": -192.92529296875,
"logps/rejected": -315.1015930175781,
"loss": 0.4253,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0793309286236763,
"rewards/margins": 0.05516732484102249,
"rewards/rejected": -0.1344982385635376,
"step": 29
},
{
"debug/policy_chosen_logits": -1.033249020576477,
"debug/policy_chosen_logps": -129.13734436035156,
"debug/policy_rejected_logits": -1.1481682062149048,
"debug/policy_rejected_logps": -319.0918884277344,
"debug/reference_chosen_logps": -134.66598510742188,
"debug/reference_rejected_logps": -297.1129150390625,
"epoch": 0.46875,
"grad_norm": 41.13041833853564,
"learning_rate": 1e-06,
"logits/chosen": -1.033249020576477,
"logits/rejected": -1.1481682062149048,
"logps/chosen": -129.13734436035156,
"logps/rejected": -319.0918884277344,
"loss": 0.4069,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.05528645217418671,
"rewards/margins": 0.2750762701034546,
"rewards/rejected": -0.2197897881269455,
"step": 30
},
{
"debug/policy_chosen_logits": -1.1428550481796265,
"debug/policy_chosen_logps": -174.7340087890625,
"debug/policy_rejected_logits": -1.017913818359375,
"debug/policy_rejected_logps": -238.23471069335938,
"debug/reference_chosen_logps": -180.0450897216797,
"debug/reference_rejected_logps": -228.79031372070312,
"epoch": 0.484375,
"grad_norm": 54.8216481339695,
"learning_rate": 1e-06,
"logits/chosen": -1.1428550481796265,
"logits/rejected": -1.017913818359375,
"logps/chosen": -174.7340087890625,
"logps/rejected": -238.23471069335938,
"loss": 0.4467,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.053110986948013306,
"rewards/margins": 0.1475549191236496,
"rewards/rejected": -0.0944439247250557,
"step": 31
},
{
"debug/policy_chosen_logits": -1.0067996978759766,
"debug/policy_chosen_logps": -145.49851989746094,
"debug/policy_rejected_logits": -1.210583209991455,
"debug/policy_rejected_logps": -274.90240478515625,
"debug/reference_chosen_logps": -151.12542724609375,
"debug/reference_rejected_logps": -264.36016845703125,
"epoch": 0.5,
"grad_norm": 31.943528016300796,
"learning_rate": 1e-06,
"logits/chosen": -1.0067996978759766,
"logits/rejected": -1.210583209991455,
"logps/chosen": -145.49851989746094,
"logps/rejected": -274.90240478515625,
"loss": 0.3966,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.05626893788576126,
"rewards/margins": 0.16169115900993347,
"rewards/rejected": -0.10542222112417221,
"step": 32
},
{
"debug/policy_chosen_logits": -1.1181310415267944,
"debug/policy_chosen_logps": -154.81201171875,
"debug/policy_rejected_logits": -1.2310353517532349,
"debug/policy_rejected_logps": -287.8173828125,
"debug/reference_chosen_logps": -170.07876586914062,
"debug/reference_rejected_logps": -274.1385498046875,
"epoch": 0.515625,
"grad_norm": 18.618810659036946,
"learning_rate": 1e-06,
"logits/chosen": -1.1181310415267944,
"logits/rejected": -1.2310353517532349,
"logps/chosen": -154.81201171875,
"logps/rejected": -287.8173828125,
"loss": 0.3581,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.1526675671339035,
"rewards/margins": 0.2894558906555176,
"rewards/rejected": -0.13678830862045288,
"step": 33
},
{
"debug/policy_chosen_logits": -1.0529826879501343,
"debug/policy_chosen_logps": -128.18128967285156,
"debug/policy_rejected_logits": -1.2277421951293945,
"debug/policy_rejected_logps": -326.91705322265625,
"debug/reference_chosen_logps": -147.74295043945312,
"debug/reference_rejected_logps": -300.6445617675781,
"epoch": 0.53125,
"grad_norm": 19.76877319971208,
"learning_rate": 1e-06,
"logits/chosen": -1.0529826879501343,
"logits/rejected": -1.2277421951293945,
"logps/chosen": -128.18128967285156,
"logps/rejected": -326.91705322265625,
"loss": 0.3702,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.19561666250228882,
"rewards/margins": 0.4583418369293213,
"rewards/rejected": -0.26272517442703247,
"step": 34
},
{
"debug/policy_chosen_logits": -1.0484968423843384,
"debug/policy_chosen_logps": -177.14181518554688,
"debug/policy_rejected_logits": -1.0831434726715088,
"debug/policy_rejected_logps": -277.63067626953125,
"debug/reference_chosen_logps": -184.79954528808594,
"debug/reference_rejected_logps": -262.337646484375,
"epoch": 0.546875,
"grad_norm": 13.405977545151604,
"learning_rate": 1e-06,
"logits/chosen": -1.0484968423843384,
"logits/rejected": -1.0831434726715088,
"logps/chosen": -177.14181518554688,
"logps/rejected": -277.63067626953125,
"loss": 0.3774,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.07657738029956818,
"rewards/margins": 0.22950761020183563,
"rewards/rejected": -0.15293022990226746,
"step": 35
},
{
"debug/policy_chosen_logits": -1.070804476737976,
"debug/policy_chosen_logps": -119.32257080078125,
"debug/policy_rejected_logits": -1.1960089206695557,
"debug/policy_rejected_logps": -257.6097412109375,
"debug/reference_chosen_logps": -134.1144561767578,
"debug/reference_rejected_logps": -249.19239807128906,
"epoch": 0.5625,
"grad_norm": 54.78669264655883,
"learning_rate": 1e-06,
"logits/chosen": -1.070804476737976,
"logits/rejected": -1.1960089206695557,
"logps/chosen": -119.32257080078125,
"logps/rejected": -257.6097412109375,
"loss": 0.4202,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.1479189097881317,
"rewards/margins": 0.23209232091903687,
"rewards/rejected": -0.08417341113090515,
"step": 36
},
{
"debug/policy_chosen_logits": -1.0936942100524902,
"debug/policy_chosen_logps": -198.59994506835938,
"debug/policy_rejected_logits": -1.1287853717803955,
"debug/policy_rejected_logps": -267.81048583984375,
"debug/reference_chosen_logps": -206.68980407714844,
"debug/reference_rejected_logps": -260.12896728515625,
"epoch": 0.578125,
"grad_norm": 18.618309162410206,
"learning_rate": 1e-06,
"logits/chosen": -1.0936942100524902,
"logits/rejected": -1.1287853717803955,
"logps/chosen": -198.59994506835938,
"logps/rejected": -267.81048583984375,
"loss": 0.3922,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.08089858293533325,
"rewards/margins": 0.15771383047103882,
"rewards/rejected": -0.07681524008512497,
"step": 37
},
{
"debug/policy_chosen_logits": -1.0987818241119385,
"debug/policy_chosen_logps": -156.1143798828125,
"debug/policy_rejected_logits": -1.016094446182251,
"debug/policy_rejected_logps": -280.1226806640625,
"debug/reference_chosen_logps": -174.13986206054688,
"debug/reference_rejected_logps": -272.59063720703125,
"epoch": 0.59375,
"grad_norm": 48.92722394829403,
"learning_rate": 1e-06,
"logits/chosen": -1.0987818241119385,
"logits/rejected": -1.016094446182251,
"logps/chosen": -156.1143798828125,
"logps/rejected": -280.1226806640625,
"loss": 0.4235,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.18025492131710052,
"rewards/margins": 0.2555754780769348,
"rewards/rejected": -0.07532056421041489,
"step": 38
},
{
"debug/policy_chosen_logits": -1.1431177854537964,
"debug/policy_chosen_logps": -121.45298767089844,
"debug/policy_rejected_logits": -1.2573899030685425,
"debug/policy_rejected_logps": -243.77618408203125,
"debug/reference_chosen_logps": -132.9182891845703,
"debug/reference_rejected_logps": -236.6573486328125,
"epoch": 0.609375,
"grad_norm": 28.262320173832173,
"learning_rate": 1e-06,
"logits/chosen": -1.1431177854537964,
"logits/rejected": -1.2573899030685425,
"logps/chosen": -121.45298767089844,
"logps/rejected": -243.77618408203125,
"loss": 0.3976,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.11465291678905487,
"rewards/margins": 0.18584111332893372,
"rewards/rejected": -0.07118818163871765,
"step": 39
},
{
"debug/policy_chosen_logits": -1.1291528940200806,
"debug/policy_chosen_logps": -124.88560485839844,
"debug/policy_rejected_logits": -1.1997623443603516,
"debug/policy_rejected_logps": -341.6507568359375,
"debug/reference_chosen_logps": -145.1587677001953,
"debug/reference_rejected_logps": -316.4557189941406,
"epoch": 0.625,
"grad_norm": 17.205504877297493,
"learning_rate": 1e-06,
"logits/chosen": -1.1291528940200806,
"logits/rejected": -1.1997623443603516,
"logps/chosen": -124.88560485839844,
"logps/rejected": -341.6507568359375,
"loss": 0.3983,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.202731654047966,
"rewards/margins": 0.4546818137168884,
"rewards/rejected": -0.25195014476776123,
"step": 40
},
{
"debug/policy_chosen_logits": -1.119407057762146,
"debug/policy_chosen_logps": -155.58392333984375,
"debug/policy_rejected_logits": -1.165313720703125,
"debug/policy_rejected_logps": -216.57156372070312,
"debug/reference_chosen_logps": -161.89459228515625,
"debug/reference_rejected_logps": -214.2755126953125,
"epoch": 0.640625,
"grad_norm": 20.732094832807366,
"learning_rate": 1e-06,
"logits/chosen": -1.119407057762146,
"logits/rejected": -1.165313720703125,
"logps/chosen": -155.58392333984375,
"logps/rejected": -216.57156372070312,
"loss": 0.3763,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.06310684233903885,
"rewards/margins": 0.0860673040151596,
"rewards/rejected": -0.02296045981347561,
"step": 41
},
{
"debug/policy_chosen_logits": -1.2078365087509155,
"debug/policy_chosen_logps": -137.1336212158203,
"debug/policy_rejected_logits": -1.2154945135116577,
"debug/policy_rejected_logps": -227.4922637939453,
"debug/reference_chosen_logps": -139.9180145263672,
"debug/reference_rejected_logps": -215.813232421875,
"epoch": 0.65625,
"grad_norm": 34.027873181354636,
"learning_rate": 1e-06,
"logits/chosen": -1.2078365087509155,
"logits/rejected": -1.2154945135116577,
"logps/chosen": -137.1336212158203,
"logps/rejected": -227.4922637939453,
"loss": 0.4182,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.02784401923418045,
"rewards/margins": 0.14463430643081665,
"rewards/rejected": -0.1167902946472168,
"step": 42
},
{
"debug/policy_chosen_logits": -1.0180912017822266,
"debug/policy_chosen_logps": -173.8270263671875,
"debug/policy_rejected_logits": -1.1830826997756958,
"debug/policy_rejected_logps": -286.73638916015625,
"debug/reference_chosen_logps": -174.58895874023438,
"debug/reference_rejected_logps": -263.51458740234375,
"epoch": 0.671875,
"grad_norm": 26.885686366047068,
"learning_rate": 1e-06,
"logits/chosen": -1.0180912017822266,
"logits/rejected": -1.1830826997756958,
"logps/chosen": -173.8270263671875,
"logps/rejected": -286.73638916015625,
"loss": 0.3939,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.007619347423315048,
"rewards/margins": 0.2398374080657959,
"rewards/rejected": -0.23221805691719055,
"step": 43
},
{
"debug/policy_chosen_logits": -0.8629423379898071,
"debug/policy_chosen_logps": -186.4468994140625,
"debug/policy_rejected_logits": -1.196955680847168,
"debug/policy_rejected_logps": -291.8290710449219,
"debug/reference_chosen_logps": -192.09939575195312,
"debug/reference_rejected_logps": -283.04547119140625,
"epoch": 0.6875,
"grad_norm": 15.341359477798175,
"learning_rate": 1e-06,
"logits/chosen": -0.8629423379898071,
"logits/rejected": -1.196955680847168,
"logps/chosen": -186.4468994140625,
"logps/rejected": -291.8290710449219,
"loss": 0.3941,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.05652495473623276,
"rewards/margins": 0.14436087012290955,
"rewards/rejected": -0.08783592283725739,
"step": 44
},
{
"debug/policy_chosen_logits": -1.1474281549453735,
"debug/policy_chosen_logps": -185.1705322265625,
"debug/policy_rejected_logits": -1.2113550901412964,
"debug/policy_rejected_logps": -299.13165283203125,
"debug/reference_chosen_logps": -184.02684020996094,
"debug/reference_rejected_logps": -283.3847961425781,
"epoch": 0.703125,
"grad_norm": 27.424785120293386,
"learning_rate": 1e-06,
"logits/chosen": -1.1474281549453735,
"logits/rejected": -1.2113550901412964,
"logps/chosen": -185.1705322265625,
"logps/rejected": -299.13165283203125,
"loss": 0.4015,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.011436812579631805,
"rewards/margins": 0.1460317075252533,
"rewards/rejected": -0.1574685126543045,
"step": 45
},
{
"debug/policy_chosen_logits": -1.0573773384094238,
"debug/policy_chosen_logps": -127.71075439453125,
"debug/policy_rejected_logits": -1.0924162864685059,
"debug/policy_rejected_logps": -323.93768310546875,
"debug/reference_chosen_logps": -139.21630859375,
"debug/reference_rejected_logps": -311.1994323730469,
"epoch": 0.71875,
"grad_norm": 17.144934905131425,
"learning_rate": 1e-06,
"logits/chosen": -1.0573773384094238,
"logits/rejected": -1.0924162864685059,
"logps/chosen": -127.71075439453125,
"logps/rejected": -323.93768310546875,
"loss": 0.3624,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.11505550146102905,
"rewards/margins": 0.24243810772895813,
"rewards/rejected": -0.12738259136676788,
"step": 46
},
{
"debug/policy_chosen_logits": -1.0909616947174072,
"debug/policy_chosen_logps": -137.27731323242188,
"debug/policy_rejected_logits": -1.2138352394104004,
"debug/policy_rejected_logps": -241.8701171875,
"debug/reference_chosen_logps": -147.23553466796875,
"debug/reference_rejected_logps": -222.49639892578125,
"epoch": 0.734375,
"grad_norm": 12.93169650628382,
"learning_rate": 1e-06,
"logits/chosen": -1.0909616947174072,
"logits/rejected": -1.2138352394104004,
"logps/chosen": -137.27731323242188,
"logps/rejected": -241.8701171875,
"loss": 0.3217,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.0995820164680481,
"rewards/margins": 0.2933192849159241,
"rewards/rejected": -0.19373726844787598,
"step": 47
},
{
"debug/policy_chosen_logits": -1.096240520477295,
"debug/policy_chosen_logps": -232.75778198242188,
"debug/policy_rejected_logits": -1.1766290664672852,
"debug/policy_rejected_logps": -306.53369140625,
"debug/reference_chosen_logps": -230.5318145751953,
"debug/reference_rejected_logps": -294.82598876953125,
"epoch": 0.75,
"grad_norm": 26.099751982850893,
"learning_rate": 1e-06,
"logits/chosen": -1.096240520477295,
"logits/rejected": -1.1766290664672852,
"logps/chosen": -232.75778198242188,
"logps/rejected": -306.53369140625,
"loss": 0.4361,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.022259674966335297,
"rewards/margins": 0.09481699019670486,
"rewards/rejected": -0.11707665771245956,
"step": 48
},
{
"debug/policy_chosen_logits": -1.1644705533981323,
"debug/policy_chosen_logps": -166.67062377929688,
"debug/policy_rejected_logits": -1.293932557106018,
"debug/policy_rejected_logps": -293.45050048828125,
"debug/reference_chosen_logps": -175.53598022460938,
"debug/reference_rejected_logps": -276.24322509765625,
"epoch": 0.765625,
"grad_norm": 18.426480334845714,
"learning_rate": 1e-06,
"logits/chosen": -1.1644705533981323,
"logits/rejected": -1.293932557106018,
"logps/chosen": -166.67062377929688,
"logps/rejected": -293.45050048828125,
"loss": 0.4144,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.08865345269441605,
"rewards/margins": 0.2607261538505554,
"rewards/rejected": -0.17207267880439758,
"step": 49
},
{
"debug/policy_chosen_logits": -1.2152189016342163,
"debug/policy_chosen_logps": -170.15440368652344,
"debug/policy_rejected_logits": -1.2675527334213257,
"debug/policy_rejected_logps": -284.37353515625,
"debug/reference_chosen_logps": -173.90533447265625,
"debug/reference_rejected_logps": -265.96417236328125,
"epoch": 0.78125,
"grad_norm": 19.567832925259168,
"learning_rate": 1e-06,
"logits/chosen": -1.2152189016342163,
"logits/rejected": -1.2675527334213257,
"logps/chosen": -170.15440368652344,
"logps/rejected": -284.37353515625,
"loss": 0.3895,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.03750941902399063,
"rewards/margins": 0.22160324454307556,
"rewards/rejected": -0.18409383296966553,
"step": 50
},
{
"debug/policy_chosen_logits": -1.1334317922592163,
"debug/policy_chosen_logps": -127.97447204589844,
"debug/policy_rejected_logits": -1.07590651512146,
"debug/policy_rejected_logps": -220.5333251953125,
"debug/reference_chosen_logps": -136.77487182617188,
"debug/reference_rejected_logps": -219.20693969726562,
"epoch": 0.796875,
"grad_norm": 37.00007516828202,
"learning_rate": 1e-06,
"logits/chosen": -1.1334317922592163,
"logits/rejected": -1.07590651512146,
"logps/chosen": -127.97447204589844,
"logps/rejected": -220.5333251953125,
"loss": 0.3521,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0880039781332016,
"rewards/margins": 0.10126776248216629,
"rewards/rejected": -0.013263778761029243,
"step": 51
},
{
"debug/policy_chosen_logits": -1.207089900970459,
"debug/policy_chosen_logps": -149.59579467773438,
"debug/policy_rejected_logits": -1.3598229885101318,
"debug/policy_rejected_logps": -312.65423583984375,
"debug/reference_chosen_logps": -160.83349609375,
"debug/reference_rejected_logps": -290.1050109863281,
"epoch": 0.8125,
"grad_norm": 34.43193601355931,
"learning_rate": 1e-06,
"logits/chosen": -1.207089900970459,
"logits/rejected": -1.3598229885101318,
"logps/chosen": -149.59579467773438,
"logps/rejected": -312.65423583984375,
"loss": 0.3701,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.11237694323062897,
"rewards/margins": 0.3378693461418152,
"rewards/rejected": -0.22549240291118622,
"step": 52
},
{
"debug/policy_chosen_logits": -1.0188125371932983,
"debug/policy_chosen_logps": -185.8585205078125,
"debug/policy_rejected_logits": -1.0791672468185425,
"debug/policy_rejected_logps": -251.5456085205078,
"debug/reference_chosen_logps": -191.01089477539062,
"debug/reference_rejected_logps": -245.11524963378906,
"epoch": 0.828125,
"grad_norm": 14.59754124103045,
"learning_rate": 1e-06,
"logits/chosen": -1.0188125371932983,
"logits/rejected": -1.0791672468185425,
"logps/chosen": -185.8585205078125,
"logps/rejected": -251.5456085205078,
"loss": 0.3737,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.051523588597774506,
"rewards/margins": 0.1158272996544838,
"rewards/rejected": -0.0643036961555481,
"step": 53
},
{
"debug/policy_chosen_logits": -1.0893926620483398,
"debug/policy_chosen_logps": -149.98660278320312,
"debug/policy_rejected_logits": -1.0650213956832886,
"debug/policy_rejected_logps": -274.9080810546875,
"debug/reference_chosen_logps": -158.46145629882812,
"debug/reference_rejected_logps": -258.4507141113281,
"epoch": 0.84375,
"grad_norm": 14.810580428901549,
"learning_rate": 1e-06,
"logits/chosen": -1.0893926620483398,
"logits/rejected": -1.0650213956832886,
"logps/chosen": -149.98660278320312,
"logps/rejected": -274.9080810546875,
"loss": 0.3213,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.0847485214471817,
"rewards/margins": 0.24932223558425903,
"rewards/rejected": -0.16457369923591614,
"step": 54
},
{
"debug/policy_chosen_logits": -1.2362074851989746,
"debug/policy_chosen_logps": -102.0992202758789,
"debug/policy_rejected_logits": -1.3010079860687256,
"debug/policy_rejected_logps": -289.4234313964844,
"debug/reference_chosen_logps": -120.96076965332031,
"debug/reference_rejected_logps": -275.486083984375,
"epoch": 0.859375,
"grad_norm": 13.99372317117744,
"learning_rate": 1e-06,
"logits/chosen": -1.2362074851989746,
"logits/rejected": -1.3010079860687256,
"logps/chosen": -102.0992202758789,
"logps/rejected": -289.4234313964844,
"loss": 0.4006,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18861553072929382,
"rewards/margins": 0.32798925042152405,
"rewards/rejected": -0.13937373459339142,
"step": 55
},
{
"debug/policy_chosen_logits": -1.0959794521331787,
"debug/policy_chosen_logps": -176.76089477539062,
"debug/policy_rejected_logits": -1.291311264038086,
"debug/policy_rejected_logps": -313.87506103515625,
"debug/reference_chosen_logps": -185.58998107910156,
"debug/reference_rejected_logps": -290.28045654296875,
"epoch": 0.875,
"grad_norm": 37.669129247782706,
"learning_rate": 1e-06,
"logits/chosen": -1.0959794521331787,
"logits/rejected": -1.291311264038086,
"logps/chosen": -176.76089477539062,
"logps/rejected": -313.87506103515625,
"loss": 0.3289,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.08829064667224884,
"rewards/margins": 0.3242364823818207,
"rewards/rejected": -0.23594582080841064,
"step": 56
},
{
"debug/policy_chosen_logits": -1.0723934173583984,
"debug/policy_chosen_logps": -127.6189193725586,
"debug/policy_rejected_logits": -1.1941779851913452,
"debug/policy_rejected_logps": -263.9356689453125,
"debug/reference_chosen_logps": -139.6109161376953,
"debug/reference_rejected_logps": -251.62448120117188,
"epoch": 0.890625,
"grad_norm": 15.916622092420505,
"learning_rate": 1e-06,
"logits/chosen": -1.0723934173583984,
"logits/rejected": -1.1941779851913452,
"logps/chosen": -127.6189193725586,
"logps/rejected": -263.9356689453125,
"loss": 0.3641,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.11991991102695465,
"rewards/margins": 0.24303147196769714,
"rewards/rejected": -0.12311156839132309,
"step": 57
},
{
"debug/policy_chosen_logits": -1.083500862121582,
"debug/policy_chosen_logps": -212.94515991210938,
"debug/policy_rejected_logits": -1.196679711341858,
"debug/policy_rejected_logps": -263.7575378417969,
"debug/reference_chosen_logps": -221.95928955078125,
"debug/reference_rejected_logps": -267.07586669921875,
"epoch": 0.90625,
"grad_norm": 26.520012974267605,
"learning_rate": 1e-06,
"logits/chosen": -1.083500862121582,
"logits/rejected": -1.196679711341858,
"logps/chosen": -212.94515991210938,
"logps/rejected": -263.7575378417969,
"loss": 0.4082,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.09014149010181427,
"rewards/margins": 0.05695834010839462,
"rewards/rejected": 0.03318314626812935,
"step": 58
},
{
"debug/policy_chosen_logits": -1.2750979661941528,
"debug/policy_chosen_logps": -120.48554229736328,
"debug/policy_rejected_logits": -1.2684656381607056,
"debug/policy_rejected_logps": -331.54986572265625,
"debug/reference_chosen_logps": -130.17742919921875,
"debug/reference_rejected_logps": -307.9356689453125,
"epoch": 0.921875,
"grad_norm": 19.676716039926774,
"learning_rate": 1e-06,
"logits/chosen": -1.2750979661941528,
"logits/rejected": -1.2684656381607056,
"logps/chosen": -120.48554229736328,
"logps/rejected": -331.54986572265625,
"loss": 0.3486,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.09691886603832245,
"rewards/margins": 0.3330605924129486,
"rewards/rejected": -0.23614171147346497,
"step": 59
},
{
"debug/policy_chosen_logits": -1.1700026988983154,
"debug/policy_chosen_logps": -164.0662078857422,
"debug/policy_rejected_logits": -1.0647200345993042,
"debug/policy_rejected_logps": -289.3599548339844,
"debug/reference_chosen_logps": -172.45896911621094,
"debug/reference_rejected_logps": -276.63592529296875,
"epoch": 0.9375,
"grad_norm": 16.5920657057711,
"learning_rate": 1e-06,
"logits/chosen": -1.1700026988983154,
"logits/rejected": -1.0647200345993042,
"logps/chosen": -164.0662078857422,
"logps/rejected": -289.3599548339844,
"loss": 0.3551,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.08392763137817383,
"rewards/margins": 0.21116778254508972,
"rewards/rejected": -0.1272401511669159,
"step": 60
},
{
"debug/policy_chosen_logits": -1.2429842948913574,
"debug/policy_chosen_logps": -164.30996704101562,
"debug/policy_rejected_logits": -1.2771668434143066,
"debug/policy_rejected_logps": -291.66436767578125,
"debug/reference_chosen_logps": -178.3618927001953,
"debug/reference_rejected_logps": -263.1362609863281,
"epoch": 0.953125,
"grad_norm": 17.917957649513887,
"learning_rate": 1e-06,
"logits/chosen": -1.2429842948913574,
"logits/rejected": -1.2771668434143066,
"logps/chosen": -164.30996704101562,
"logps/rejected": -291.66436767578125,
"loss": 0.333,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14051929116249084,
"rewards/margins": 0.42580026388168335,
"rewards/rejected": -0.2852809429168701,
"step": 61
},
{
"debug/policy_chosen_logits": -1.1629077196121216,
"debug/policy_chosen_logps": -171.4347381591797,
"debug/policy_rejected_logits": -1.2383259534835815,
"debug/policy_rejected_logps": -257.24322509765625,
"debug/reference_chosen_logps": -176.8075408935547,
"debug/reference_rejected_logps": -236.5648193359375,
"epoch": 0.96875,
"grad_norm": 27.22416658714319,
"learning_rate": 1e-06,
"logits/chosen": -1.1629077196121216,
"logits/rejected": -1.2383259534835815,
"logps/chosen": -171.4347381591797,
"logps/rejected": -257.24322509765625,
"loss": 0.3685,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.053728047758340836,
"rewards/margins": 0.2605122923851013,
"rewards/rejected": -0.20678424835205078,
"step": 62
},
{
"debug/policy_chosen_logits": -1.167179822921753,
"debug/policy_chosen_logps": -241.232666015625,
"debug/policy_rejected_logits": -1.1904563903808594,
"debug/policy_rejected_logps": -349.2745361328125,
"debug/reference_chosen_logps": -237.09837341308594,
"debug/reference_rejected_logps": -312.7959289550781,
"epoch": 0.984375,
"grad_norm": 35.440096057306455,
"learning_rate": 1e-06,
"logits/chosen": -1.167179822921753,
"logits/rejected": -1.1904563903808594,
"logps/chosen": -241.232666015625,
"logps/rejected": -349.2745361328125,
"loss": 0.3671,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.04134296253323555,
"rewards/margins": 0.32344281673431396,
"rewards/rejected": -0.3647857904434204,
"step": 63
},
{
"debug/policy_chosen_logits": -1.1739040613174438,
"debug/policy_chosen_logps": -139.26182556152344,
"debug/policy_rejected_logits": -1.2884361743927002,
"debug/policy_rejected_logps": -280.71124267578125,
"debug/reference_chosen_logps": -148.495361328125,
"debug/reference_rejected_logps": -259.9752197265625,
"epoch": 1.0,
"grad_norm": 43.313159052812985,
"learning_rate": 1e-06,
"logits/chosen": -1.1739040613174438,
"logits/rejected": -1.2884361743927002,
"logps/chosen": -139.26182556152344,
"logps/rejected": -280.71124267578125,
"loss": 0.3453,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.09233523905277252,
"rewards/margins": 0.2996952533721924,
"rewards/rejected": -0.20735999941825867,
"step": 64
},
{
"epoch": 1.0,
"step": 64,
"total_flos": 0.0,
"train_loss": 0.40815131505951285,
"train_runtime": 194.2921,
"train_samples_per_second": 20.979,
"train_steps_per_second": 0.329
}
],
"logging_steps": 1,
"max_steps": 64,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}