qwen2_chat_reflct_adamw_iter7 / trainer_state.json
yiran-wang3's picture
End of training
82c52d6 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 44,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"debug/policy_chosen_logits": -1.4075380563735962,
"debug/policy_chosen_logps": -300.2610168457031,
"debug/policy_rejected_logits": -1.3621673583984375,
"debug/policy_rejected_logps": -278.803955078125,
"debug/reference_chosen_logps": -300.2610168457031,
"debug/reference_rejected_logps": -278.803955078125,
"epoch": 0.022727272727272728,
"grad_norm": 29.710163996651048,
"learning_rate": 1e-06,
"logits/chosen": -1.4075380563735962,
"logits/rejected": -1.3621673583984375,
"logps/chosen": -300.2610168457031,
"logps/rejected": -278.803955078125,
"loss": 0.5,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"debug/policy_chosen_logits": -1.658880352973938,
"debug/policy_chosen_logps": -246.17088317871094,
"debug/policy_rejected_logits": -1.3099008798599243,
"debug/policy_rejected_logps": -291.26806640625,
"debug/reference_chosen_logps": -249.18255615234375,
"debug/reference_rejected_logps": -293.00592041015625,
"epoch": 0.045454545454545456,
"grad_norm": 23.678192495361564,
"learning_rate": 1e-06,
"logits/chosen": -1.658880352973938,
"logits/rejected": -1.3099008798599243,
"logps/chosen": -246.17088317871094,
"logps/rejected": -291.26806640625,
"loss": 0.5075,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.03011680766940117,
"rewards/margins": 0.012738131918013096,
"rewards/rejected": 0.01737867295742035,
"step": 2
},
{
"debug/policy_chosen_logits": -1.6099661588668823,
"debug/policy_chosen_logps": -235.6559600830078,
"debug/policy_rejected_logits": -1.2918318510055542,
"debug/policy_rejected_logps": -312.91632080078125,
"debug/reference_chosen_logps": -238.7340087890625,
"debug/reference_rejected_logps": -312.7275390625,
"epoch": 0.06818181818181818,
"grad_norm": 34.989549498202656,
"learning_rate": 1e-06,
"logits/chosen": -1.6099661588668823,
"logits/rejected": -1.2918318510055542,
"logps/chosen": -235.6559600830078,
"logps/rejected": -312.91632080078125,
"loss": 0.5084,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.03078046813607216,
"rewards/margins": 0.032668597996234894,
"rewards/rejected": -0.0018881321884691715,
"step": 3
},
{
"debug/policy_chosen_logits": -1.3408846855163574,
"debug/policy_chosen_logps": -242.54039001464844,
"debug/policy_rejected_logits": -1.2990057468414307,
"debug/policy_rejected_logps": -295.8603515625,
"debug/reference_chosen_logps": -242.50985717773438,
"debug/reference_rejected_logps": -295.34893798828125,
"epoch": 0.09090909090909091,
"grad_norm": 32.869910385515084,
"learning_rate": 1e-06,
"logits/chosen": -1.3408846855163574,
"logits/rejected": -1.2990057468414307,
"logps/chosen": -242.54039001464844,
"logps/rejected": -295.8603515625,
"loss": 0.5033,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0003051948733627796,
"rewards/margins": 0.0048088072799146175,
"rewards/rejected": -0.005114002153277397,
"step": 4
},
{
"debug/policy_chosen_logits": -1.5846848487854004,
"debug/policy_chosen_logps": -272.6408386230469,
"debug/policy_rejected_logits": -1.2951120138168335,
"debug/policy_rejected_logps": -337.945068359375,
"debug/reference_chosen_logps": -272.4866943359375,
"debug/reference_rejected_logps": -338.4932861328125,
"epoch": 0.11363636363636363,
"grad_norm": 16.739518531494987,
"learning_rate": 1e-06,
"logits/chosen": -1.5846848487854004,
"logits/rejected": -1.2951120138168335,
"logps/chosen": -272.6408386230469,
"logps/rejected": -337.945068359375,
"loss": 0.4931,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.001541042234748602,
"rewards/margins": -0.007023181766271591,
"rewards/rejected": 0.005482139065861702,
"step": 5
},
{
"debug/policy_chosen_logits": -1.4175924062728882,
"debug/policy_chosen_logps": -237.64132690429688,
"debug/policy_rejected_logits": -1.3938319683074951,
"debug/policy_rejected_logps": -245.30130004882812,
"debug/reference_chosen_logps": -236.71168518066406,
"debug/reference_rejected_logps": -243.54733276367188,
"epoch": 0.13636363636363635,
"grad_norm": 33.05997081062945,
"learning_rate": 1e-06,
"logits/chosen": -1.4175924062728882,
"logits/rejected": -1.3938319683074951,
"logps/chosen": -237.64132690429688,
"logps/rejected": -245.30130004882812,
"loss": 0.4969,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00929628312587738,
"rewards/margins": 0.008243197575211525,
"rewards/rejected": -0.017539482563734055,
"step": 6
},
{
"debug/policy_chosen_logits": -1.4232984781265259,
"debug/policy_chosen_logps": -243.94329833984375,
"debug/policy_rejected_logits": -1.4116002321243286,
"debug/policy_rejected_logps": -248.73439025878906,
"debug/reference_chosen_logps": -241.92886352539062,
"debug/reference_rejected_logps": -247.91339111328125,
"epoch": 0.1590909090909091,
"grad_norm": 22.910586896410035,
"learning_rate": 1e-06,
"logits/chosen": -1.4232984781265259,
"logits/rejected": -1.4116002321243286,
"logps/chosen": -243.94329833984375,
"logps/rejected": -248.73439025878906,
"loss": 0.5036,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.020144157111644745,
"rewards/margins": -0.011934204027056694,
"rewards/rejected": -0.00820995308458805,
"step": 7
},
{
"debug/policy_chosen_logits": -1.4712331295013428,
"debug/policy_chosen_logps": -233.98004150390625,
"debug/policy_rejected_logits": -1.442948579788208,
"debug/policy_rejected_logps": -264.7082214355469,
"debug/reference_chosen_logps": -232.63677978515625,
"debug/reference_rejected_logps": -265.5264892578125,
"epoch": 0.18181818181818182,
"grad_norm": 15.718654227738572,
"learning_rate": 1e-06,
"logits/chosen": -1.4712331295013428,
"logits/rejected": -1.442948579788208,
"logps/chosen": -233.98004150390625,
"logps/rejected": -264.7082214355469,
"loss": 0.5057,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.013432646170258522,
"rewards/margins": -0.021615436300635338,
"rewards/rejected": 0.008182793855667114,
"step": 8
},
{
"debug/policy_chosen_logits": -1.5352590084075928,
"debug/policy_chosen_logps": -243.7238006591797,
"debug/policy_rejected_logits": -1.5029191970825195,
"debug/policy_rejected_logps": -266.58013916015625,
"debug/reference_chosen_logps": -241.81459045410156,
"debug/reference_rejected_logps": -264.5735778808594,
"epoch": 0.20454545454545456,
"grad_norm": 22.532643036918554,
"learning_rate": 1e-06,
"logits/chosen": -1.5352590084075928,
"logits/rejected": -1.5029191970825195,
"logps/chosen": -243.7238006591797,
"logps/rejected": -266.58013916015625,
"loss": 0.5012,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.019092101603746414,
"rewards/margins": 0.0009734919294714928,
"rewards/rejected": -0.02006559446454048,
"step": 9
},
{
"debug/policy_chosen_logits": -1.5353741645812988,
"debug/policy_chosen_logps": -268.5412902832031,
"debug/policy_rejected_logits": -1.5154732465744019,
"debug/policy_rejected_logps": -282.08551025390625,
"debug/reference_chosen_logps": -267.7362976074219,
"debug/reference_rejected_logps": -279.14666748046875,
"epoch": 0.22727272727272727,
"grad_norm": 23.319868382806877,
"learning_rate": 1e-06,
"logits/chosen": -1.5353741645812988,
"logits/rejected": -1.5154732465744019,
"logps/chosen": -268.5412902832031,
"logps/rejected": -282.08551025390625,
"loss": 0.5019,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.008050002157688141,
"rewards/margins": 0.02133851870894432,
"rewards/rejected": -0.02938852459192276,
"step": 10
},
{
"debug/policy_chosen_logits": -1.4472002983093262,
"debug/policy_chosen_logps": -240.2578125,
"debug/policy_rejected_logits": -1.392478108406067,
"debug/policy_rejected_logps": -324.4321594238281,
"debug/reference_chosen_logps": -234.3917236328125,
"debug/reference_rejected_logps": -319.2920837402344,
"epoch": 0.25,
"grad_norm": 17.992513913858307,
"learning_rate": 1e-06,
"logits/chosen": -1.4472002983093262,
"logits/rejected": -1.392478108406067,
"logps/chosen": -240.2578125,
"logps/rejected": -324.4321594238281,
"loss": 0.4961,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.05866073817014694,
"rewards/margins": -0.007260264828801155,
"rewards/rejected": -0.05140047147870064,
"step": 11
},
{
"debug/policy_chosen_logits": -1.3548312187194824,
"debug/policy_chosen_logps": -231.70309448242188,
"debug/policy_rejected_logits": -1.3657501935958862,
"debug/policy_rejected_logps": -271.7943420410156,
"debug/reference_chosen_logps": -226.82766723632812,
"debug/reference_rejected_logps": -266.321533203125,
"epoch": 0.2727272727272727,
"grad_norm": 73.41147441549734,
"learning_rate": 1e-06,
"logits/chosen": -1.3548312187194824,
"logits/rejected": -1.3657501935958862,
"logps/chosen": -231.70309448242188,
"logps/rejected": -271.7943420410156,
"loss": 0.5095,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.04875440523028374,
"rewards/margins": 0.0059736063703894615,
"rewards/rejected": -0.054728008806705475,
"step": 12
},
{
"debug/policy_chosen_logits": -1.5568829774856567,
"debug/policy_chosen_logps": -251.14842224121094,
"debug/policy_rejected_logits": -1.3776763677597046,
"debug/policy_rejected_logps": -241.90863037109375,
"debug/reference_chosen_logps": -250.3316650390625,
"debug/reference_rejected_logps": -240.4967803955078,
"epoch": 0.29545454545454547,
"grad_norm": 31.567012366501405,
"learning_rate": 1e-06,
"logits/chosen": -1.5568829774856567,
"logits/rejected": -1.3776763677597046,
"logps/chosen": -251.14842224121094,
"logps/rejected": -241.90863037109375,
"loss": 0.5034,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.008167648687958717,
"rewards/margins": 0.005950965452939272,
"rewards/rejected": -0.014118613675236702,
"step": 13
},
{
"debug/policy_chosen_logits": -1.2634949684143066,
"debug/policy_chosen_logps": -259.905517578125,
"debug/policy_rejected_logits": -1.2515358924865723,
"debug/policy_rejected_logps": -358.7181091308594,
"debug/reference_chosen_logps": -257.572021484375,
"debug/reference_rejected_logps": -352.8564758300781,
"epoch": 0.3181818181818182,
"grad_norm": 33.36598159337087,
"learning_rate": 1e-06,
"logits/chosen": -1.2634949684143066,
"logits/rejected": -1.2515358924865723,
"logps/chosen": -259.905517578125,
"logps/rejected": -358.7181091308594,
"loss": 0.4959,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.023335151374340057,
"rewards/margins": 0.03528135269880295,
"rewards/rejected": -0.058616504073143005,
"step": 14
},
{
"debug/policy_chosen_logits": -1.5755295753479004,
"debug/policy_chosen_logps": -271.21990966796875,
"debug/policy_rejected_logits": -1.5602691173553467,
"debug/policy_rejected_logps": -291.444580078125,
"debug/reference_chosen_logps": -269.54534912109375,
"debug/reference_rejected_logps": -286.5006408691406,
"epoch": 0.3409090909090909,
"grad_norm": 19.995661189240433,
"learning_rate": 1e-06,
"logits/chosen": -1.5755295753479004,
"logits/rejected": -1.5602691173553467,
"logps/chosen": -271.21990966796875,
"logps/rejected": -291.444580078125,
"loss": 0.4975,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.01674526184797287,
"rewards/margins": 0.032694172114133835,
"rewards/rejected": -0.049439430236816406,
"step": 15
},
{
"debug/policy_chosen_logits": -1.5511504411697388,
"debug/policy_chosen_logps": -207.6850128173828,
"debug/policy_rejected_logits": -1.3656481504440308,
"debug/policy_rejected_logps": -259.1358642578125,
"debug/reference_chosen_logps": -206.66064453125,
"debug/reference_rejected_logps": -257.42388916015625,
"epoch": 0.36363636363636365,
"grad_norm": 20.444894909265596,
"learning_rate": 1e-06,
"logits/chosen": -1.5511504411697388,
"logits/rejected": -1.3656481504440308,
"logps/chosen": -207.6850128173828,
"logps/rejected": -259.1358642578125,
"loss": 0.4822,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.01024345401674509,
"rewards/margins": 0.006875896360725164,
"rewards/rejected": -0.017119349911808968,
"step": 16
},
{
"debug/policy_chosen_logits": -1.440645456314087,
"debug/policy_chosen_logps": -246.9857940673828,
"debug/policy_rejected_logits": -1.220913290977478,
"debug/policy_rejected_logps": -311.4046325683594,
"debug/reference_chosen_logps": -246.19845581054688,
"debug/reference_rejected_logps": -311.56048583984375,
"epoch": 0.38636363636363635,
"grad_norm": 24.901672749840092,
"learning_rate": 1e-06,
"logits/chosen": -1.440645456314087,
"logits/rejected": -1.220913290977478,
"logps/chosen": -246.9857940673828,
"logps/rejected": -311.4046325683594,
"loss": 0.4986,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.007873362861573696,
"rewards/margins": -0.009431838989257812,
"rewards/rejected": 0.001558477059006691,
"step": 17
},
{
"debug/policy_chosen_logits": -1.410476803779602,
"debug/policy_chosen_logps": -272.2285461425781,
"debug/policy_rejected_logits": -1.310629963874817,
"debug/policy_rejected_logps": -228.26686096191406,
"debug/reference_chosen_logps": -270.7967224121094,
"debug/reference_rejected_logps": -229.7943878173828,
"epoch": 0.4090909090909091,
"grad_norm": 38.60148122412081,
"learning_rate": 1e-06,
"logits/chosen": -1.410476803779602,
"logits/rejected": -1.310629963874817,
"logps/chosen": -272.2285461425781,
"logps/rejected": -228.26686096191406,
"loss": 0.4949,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.014318370260298252,
"rewards/margins": -0.029593737795948982,
"rewards/rejected": 0.015275364741683006,
"step": 18
},
{
"debug/policy_chosen_logits": -1.1632928848266602,
"debug/policy_chosen_logps": -240.0203399658203,
"debug/policy_rejected_logits": -1.5429524183273315,
"debug/policy_rejected_logps": -275.91461181640625,
"debug/reference_chosen_logps": -239.4716796875,
"debug/reference_rejected_logps": -274.268310546875,
"epoch": 0.4318181818181818,
"grad_norm": 60.28746463237661,
"learning_rate": 1e-06,
"logits/chosen": -1.1632928848266602,
"logits/rejected": -1.5429524183273315,
"logps/chosen": -240.0203399658203,
"logps/rejected": -275.91461181640625,
"loss": 0.518,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.005486545152962208,
"rewards/margins": 0.010976830497384071,
"rewards/rejected": -0.016463376581668854,
"step": 19
},
{
"debug/policy_chosen_logits": -1.521999478340149,
"debug/policy_chosen_logps": -278.55535888671875,
"debug/policy_rejected_logits": -1.4620181322097778,
"debug/policy_rejected_logps": -259.122802734375,
"debug/reference_chosen_logps": -278.84912109375,
"debug/reference_rejected_logps": -258.3992919921875,
"epoch": 0.45454545454545453,
"grad_norm": 46.60154697948134,
"learning_rate": 1e-06,
"logits/chosen": -1.521999478340149,
"logits/rejected": -1.4620181322097778,
"logps/chosen": -278.55535888671875,
"logps/rejected": -259.122802734375,
"loss": 0.486,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0029377557802945375,
"rewards/margins": 0.010172786191105843,
"rewards/rejected": -0.007235030643641949,
"step": 20
},
{
"debug/policy_chosen_logits": -1.1010836362838745,
"debug/policy_chosen_logps": -272.86285400390625,
"debug/policy_rejected_logits": -1.3311625719070435,
"debug/policy_rejected_logps": -271.2930603027344,
"debug/reference_chosen_logps": -269.4169921875,
"debug/reference_rejected_logps": -273.9818420410156,
"epoch": 0.4772727272727273,
"grad_norm": 17.643110187730265,
"learning_rate": 1e-06,
"logits/chosen": -1.1010836362838745,
"logits/rejected": -1.3311625719070435,
"logps/chosen": -272.86285400390625,
"logps/rejected": -271.2930603027344,
"loss": 0.5052,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.03445838764309883,
"rewards/margins": -0.06134599447250366,
"rewards/rejected": 0.02688760869204998,
"step": 21
},
{
"debug/policy_chosen_logits": -1.5071077346801758,
"debug/policy_chosen_logps": -241.43759155273438,
"debug/policy_rejected_logits": -1.1153310537338257,
"debug/policy_rejected_logps": -331.3087158203125,
"debug/reference_chosen_logps": -245.50453186035156,
"debug/reference_rejected_logps": -333.63140869140625,
"epoch": 0.5,
"grad_norm": 44.46366116213499,
"learning_rate": 1e-06,
"logits/chosen": -1.5071077346801758,
"logits/rejected": -1.1153310537338257,
"logps/chosen": -241.43759155273438,
"logps/rejected": -331.3087158203125,
"loss": 0.5036,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.040669478476047516,
"rewards/margins": 0.017442265525460243,
"rewards/rejected": 0.023227212950587273,
"step": 22
},
{
"debug/policy_chosen_logits": -1.451465368270874,
"debug/policy_chosen_logps": -212.37417602539062,
"debug/policy_rejected_logits": -1.3166741132736206,
"debug/policy_rejected_logps": -314.9155578613281,
"debug/reference_chosen_logps": -212.66213989257812,
"debug/reference_rejected_logps": -312.25823974609375,
"epoch": 0.5227272727272727,
"grad_norm": 20.606755035100644,
"learning_rate": 1e-06,
"logits/chosen": -1.451465368270874,
"logits/rejected": -1.3166741132736206,
"logps/chosen": -212.37417602539062,
"logps/rejected": -314.9155578613281,
"loss": 0.51,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.002879485720768571,
"rewards/margins": 0.029452495276927948,
"rewards/rejected": -0.02657300978899002,
"step": 23
},
{
"debug/policy_chosen_logits": -1.4294952154159546,
"debug/policy_chosen_logps": -204.76612854003906,
"debug/policy_rejected_logits": -1.4408621788024902,
"debug/policy_rejected_logps": -281.35980224609375,
"debug/reference_chosen_logps": -203.98245239257812,
"debug/reference_rejected_logps": -276.5864562988281,
"epoch": 0.5454545454545454,
"grad_norm": 38.391670513493104,
"learning_rate": 1e-06,
"logits/chosen": -1.4294952154159546,
"logits/rejected": -1.4408621788024902,
"logps/chosen": -204.76612854003906,
"logps/rejected": -281.35980224609375,
"loss": 0.4861,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.007836684584617615,
"rewards/margins": 0.03989652544260025,
"rewards/rejected": -0.047733210027217865,
"step": 24
},
{
"debug/policy_chosen_logits": -1.215004801750183,
"debug/policy_chosen_logps": -239.42361450195312,
"debug/policy_rejected_logits": -0.9678577780723572,
"debug/policy_rejected_logps": -284.46954345703125,
"debug/reference_chosen_logps": -234.15310668945312,
"debug/reference_rejected_logps": -274.2942199707031,
"epoch": 0.5681818181818182,
"grad_norm": 72.9941160097735,
"learning_rate": 1e-06,
"logits/chosen": -1.215004801750183,
"logits/rejected": -0.9678577780723572,
"logps/chosen": -239.42361450195312,
"logps/rejected": -284.46954345703125,
"loss": 0.4944,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.052704982459545135,
"rewards/margins": 0.04904847964644432,
"rewards/rejected": -0.10175345838069916,
"step": 25
},
{
"debug/policy_chosen_logits": -1.391048550605774,
"debug/policy_chosen_logps": -260.8648986816406,
"debug/policy_rejected_logits": -1.4560633897781372,
"debug/policy_rejected_logps": -346.93145751953125,
"debug/reference_chosen_logps": -255.39593505859375,
"debug/reference_rejected_logps": -336.768310546875,
"epoch": 0.5909090909090909,
"grad_norm": 36.44224120417283,
"learning_rate": 1e-06,
"logits/chosen": -1.391048550605774,
"logits/rejected": -1.4560633897781372,
"logps/chosen": -260.8648986816406,
"logps/rejected": -346.93145751953125,
"loss": 0.5006,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.05468965321779251,
"rewards/margins": 0.04694166034460068,
"rewards/rejected": -0.10163131356239319,
"step": 26
},
{
"debug/policy_chosen_logits": -1.5338735580444336,
"debug/policy_chosen_logps": -222.9837646484375,
"debug/policy_rejected_logits": -1.471374273300171,
"debug/policy_rejected_logps": -278.50897216796875,
"debug/reference_chosen_logps": -223.839599609375,
"debug/reference_rejected_logps": -273.5242919921875,
"epoch": 0.6136363636363636,
"grad_norm": 18.787474855932984,
"learning_rate": 1e-06,
"logits/chosen": -1.5338735580444336,
"logits/rejected": -1.471374273300171,
"logps/chosen": -222.9837646484375,
"logps/rejected": -278.50897216796875,
"loss": 0.4888,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.008558426052331924,
"rewards/margins": 0.05840524658560753,
"rewards/rejected": -0.049846820533275604,
"step": 27
},
{
"debug/policy_chosen_logits": -1.5032626390457153,
"debug/policy_chosen_logps": -267.772705078125,
"debug/policy_rejected_logits": -1.4030882120132446,
"debug/policy_rejected_logps": -366.5946350097656,
"debug/reference_chosen_logps": -263.9977722167969,
"debug/reference_rejected_logps": -362.62274169921875,
"epoch": 0.6363636363636364,
"grad_norm": 89.25136180706029,
"learning_rate": 1e-06,
"logits/chosen": -1.5032626390457153,
"logits/rejected": -1.4030882120132446,
"logps/chosen": -267.772705078125,
"logps/rejected": -366.5946350097656,
"loss": 0.5033,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.03774913772940636,
"rewards/margins": 0.0019695255905389786,
"rewards/rejected": -0.039718665182590485,
"step": 28
},
{
"debug/policy_chosen_logits": -1.3853955268859863,
"debug/policy_chosen_logps": -291.3934631347656,
"debug/policy_rejected_logits": -1.288482427597046,
"debug/policy_rejected_logps": -264.8169250488281,
"debug/reference_chosen_logps": -287.48876953125,
"debug/reference_rejected_logps": -263.4806823730469,
"epoch": 0.6590909090909091,
"grad_norm": 18.946606370821538,
"learning_rate": 1e-06,
"logits/chosen": -1.3853955268859863,
"logits/rejected": -1.288482427597046,
"logps/chosen": -291.3934631347656,
"logps/rejected": -264.8169250488281,
"loss": 0.4891,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.03904670476913452,
"rewards/margins": -0.025684315711259842,
"rewards/rejected": -0.013362388126552105,
"step": 29
},
{
"debug/policy_chosen_logits": -1.3024711608886719,
"debug/policy_chosen_logps": -327.91033935546875,
"debug/policy_rejected_logits": -1.5670039653778076,
"debug/policy_rejected_logps": -321.4190368652344,
"debug/reference_chosen_logps": -323.4125061035156,
"debug/reference_rejected_logps": -318.29742431640625,
"epoch": 0.6818181818181818,
"grad_norm": 67.52699012019576,
"learning_rate": 1e-06,
"logits/chosen": -1.3024711608886719,
"logits/rejected": -1.5670039653778076,
"logps/chosen": -327.91033935546875,
"logps/rejected": -321.4190368652344,
"loss": 0.4956,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.04497835040092468,
"rewards/margins": -0.013762515038251877,
"rewards/rejected": -0.031215837225317955,
"step": 30
},
{
"debug/policy_chosen_logits": -1.5216386318206787,
"debug/policy_chosen_logps": -257.83624267578125,
"debug/policy_rejected_logits": -1.4058371782302856,
"debug/policy_rejected_logps": -300.9150390625,
"debug/reference_chosen_logps": -257.880126953125,
"debug/reference_rejected_logps": -298.1737060546875,
"epoch": 0.7045454545454546,
"grad_norm": 20.060782775941146,
"learning_rate": 1e-06,
"logits/chosen": -1.5216386318206787,
"logits/rejected": -1.4058371782302856,
"logps/chosen": -257.83624267578125,
"logps/rejected": -300.9150390625,
"loss": 0.4864,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.00043886248022317886,
"rewards/margins": 0.027851875871419907,
"rewards/rejected": -0.027413014322519302,
"step": 31
},
{
"debug/policy_chosen_logits": -1.2779229879379272,
"debug/policy_chosen_logps": -201.38796997070312,
"debug/policy_rejected_logits": -1.156309962272644,
"debug/policy_rejected_logps": -257.59429931640625,
"debug/reference_chosen_logps": -206.09375,
"debug/reference_rejected_logps": -257.58074951171875,
"epoch": 0.7272727272727273,
"grad_norm": 75.52018869219997,
"learning_rate": 1e-06,
"logits/chosen": -1.2779229879379272,
"logits/rejected": -1.156309962272644,
"logps/chosen": -201.38796997070312,
"logps/rejected": -257.59429931640625,
"loss": 0.4865,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.04705776274204254,
"rewards/margins": 0.04719316214323044,
"rewards/rejected": -0.00013540498912334442,
"step": 32
},
{
"debug/policy_chosen_logits": -1.5799387693405151,
"debug/policy_chosen_logps": -251.5661163330078,
"debug/policy_rejected_logits": -1.2778719663619995,
"debug/policy_rejected_logps": -302.98028564453125,
"debug/reference_chosen_logps": -253.25216674804688,
"debug/reference_rejected_logps": -301.58880615234375,
"epoch": 0.75,
"grad_norm": 41.55558759207921,
"learning_rate": 1e-06,
"logits/chosen": -1.5799387693405151,
"logits/rejected": -1.2778719663619995,
"logps/chosen": -251.5661163330078,
"logps/rejected": -302.98028564453125,
"loss": 0.5161,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.016860580071806908,
"rewards/margins": 0.030775029212236404,
"rewards/rejected": -0.01391445193439722,
"step": 33
},
{
"debug/policy_chosen_logits": -1.3849577903747559,
"debug/policy_chosen_logps": -243.8576202392578,
"debug/policy_rejected_logits": -1.3409092426300049,
"debug/policy_rejected_logps": -280.698486328125,
"debug/reference_chosen_logps": -245.44943237304688,
"debug/reference_rejected_logps": -283.48333740234375,
"epoch": 0.7727272727272727,
"grad_norm": 14.362369526795225,
"learning_rate": 1e-06,
"logits/chosen": -1.3849577903747559,
"logits/rejected": -1.3409092426300049,
"logps/chosen": -243.8576202392578,
"logps/rejected": -280.698486328125,
"loss": 0.483,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.015918217599391937,
"rewards/margins": -0.011930178850889206,
"rewards/rejected": 0.027848394587635994,
"step": 34
},
{
"debug/policy_chosen_logits": -1.3460434675216675,
"debug/policy_chosen_logps": -281.47344970703125,
"debug/policy_rejected_logits": -1.1573445796966553,
"debug/policy_rejected_logps": -344.3446044921875,
"debug/reference_chosen_logps": -282.89141845703125,
"debug/reference_rejected_logps": -341.33868408203125,
"epoch": 0.7954545454545454,
"grad_norm": 13.1284617391076,
"learning_rate": 1e-06,
"logits/chosen": -1.3460434675216675,
"logits/rejected": -1.1573445796966553,
"logps/chosen": -281.47344970703125,
"logps/rejected": -344.3446044921875,
"loss": 0.4686,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.01417986024171114,
"rewards/margins": 0.04423864185810089,
"rewards/rejected": -0.030058782547712326,
"step": 35
},
{
"debug/policy_chosen_logits": -1.406193733215332,
"debug/policy_chosen_logps": -207.40655517578125,
"debug/policy_rejected_logits": -1.238633394241333,
"debug/policy_rejected_logps": -262.58428955078125,
"debug/reference_chosen_logps": -213.7412567138672,
"debug/reference_rejected_logps": -260.3867492675781,
"epoch": 0.8181818181818182,
"grad_norm": 32.43079232408406,
"learning_rate": 1e-06,
"logits/chosen": -1.406193733215332,
"logits/rejected": -1.238633394241333,
"logps/chosen": -207.40655517578125,
"logps/rejected": -262.58428955078125,
"loss": 0.4674,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.0633469745516777,
"rewards/margins": 0.08532249182462692,
"rewards/rejected": -0.02197551727294922,
"step": 36
},
{
"debug/policy_chosen_logits": -1.6442923545837402,
"debug/policy_chosen_logps": -258.9522705078125,
"debug/policy_rejected_logits": -1.4356112480163574,
"debug/policy_rejected_logps": -293.63323974609375,
"debug/reference_chosen_logps": -264.2718505859375,
"debug/reference_rejected_logps": -291.06591796875,
"epoch": 0.8409090909090909,
"grad_norm": 41.971026053468705,
"learning_rate": 1e-06,
"logits/chosen": -1.6442923545837402,
"logits/rejected": -1.4356112480163574,
"logps/chosen": -258.9522705078125,
"logps/rejected": -293.63323974609375,
"loss": 0.4832,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.05319575220346451,
"rewards/margins": 0.07886872440576553,
"rewards/rejected": -0.025672968477010727,
"step": 37
},
{
"debug/policy_chosen_logits": -1.5740854740142822,
"debug/policy_chosen_logps": -229.13720703125,
"debug/policy_rejected_logits": -1.4220911264419556,
"debug/policy_rejected_logps": -265.41192626953125,
"debug/reference_chosen_logps": -232.80499267578125,
"debug/reference_rejected_logps": -263.7989501953125,
"epoch": 0.8636363636363636,
"grad_norm": 13.989957061611456,
"learning_rate": 1e-06,
"logits/chosen": -1.5740854740142822,
"logits/rejected": -1.4220911264419556,
"logps/chosen": -229.13720703125,
"logps/rejected": -265.41192626953125,
"loss": 0.4567,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.03667771816253662,
"rewards/margins": 0.05280757695436478,
"rewards/rejected": -0.016129855066537857,
"step": 38
},
{
"debug/policy_chosen_logits": -1.6223005056381226,
"debug/policy_chosen_logps": -289.09027099609375,
"debug/policy_rejected_logits": -1.5821017026901245,
"debug/policy_rejected_logps": -274.63519287109375,
"debug/reference_chosen_logps": -284.13433837890625,
"debug/reference_rejected_logps": -273.46636962890625,
"epoch": 0.8863636363636364,
"grad_norm": 15.865701906281847,
"learning_rate": 1e-06,
"logits/chosen": -1.6223005056381226,
"logits/rejected": -1.5821017026901245,
"logps/chosen": -289.09027099609375,
"logps/rejected": -274.63519287109375,
"loss": 0.491,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.049559421837329865,
"rewards/margins": -0.03787106275558472,
"rewards/rejected": -0.011688357219099998,
"step": 39
},
{
"debug/policy_chosen_logits": -1.445629358291626,
"debug/policy_chosen_logps": -197.75921630859375,
"debug/policy_rejected_logits": -1.4035519361495972,
"debug/policy_rejected_logps": -253.138916015625,
"debug/reference_chosen_logps": -195.52206420898438,
"debug/reference_rejected_logps": -253.06124877929688,
"epoch": 0.9090909090909091,
"grad_norm": 23.689590897900334,
"learning_rate": 1e-06,
"logits/chosen": -1.445629358291626,
"logits/rejected": -1.4035519361495972,
"logps/chosen": -197.75921630859375,
"logps/rejected": -253.138916015625,
"loss": 0.4872,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.022371472790837288,
"rewards/margins": -0.021594811230897903,
"rewards/rejected": -0.0007766615599393845,
"step": 40
},
{
"debug/policy_chosen_logits": -1.5050469636917114,
"debug/policy_chosen_logps": -240.60708618164062,
"debug/policy_rejected_logits": -1.2861061096191406,
"debug/policy_rejected_logps": -264.02239990234375,
"debug/reference_chosen_logps": -240.14486694335938,
"debug/reference_rejected_logps": -259.6829833984375,
"epoch": 0.9318181818181818,
"grad_norm": 19.441334795087283,
"learning_rate": 1e-06,
"logits/chosen": -1.5050469636917114,
"logits/rejected": -1.2861061096191406,
"logps/chosen": -240.60708618164062,
"logps/rejected": -264.02239990234375,
"loss": 0.4664,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.004622154403477907,
"rewards/margins": 0.038772162050008774,
"rewards/rejected": -0.04339431971311569,
"step": 41
},
{
"debug/policy_chosen_logits": -1.679877519607544,
"debug/policy_chosen_logps": -234.7102813720703,
"debug/policy_rejected_logits": -1.480136752128601,
"debug/policy_rejected_logps": -351.16754150390625,
"debug/reference_chosen_logps": -227.78549194335938,
"debug/reference_rejected_logps": -342.3416442871094,
"epoch": 0.9545454545454546,
"grad_norm": 50.79320866145651,
"learning_rate": 1e-06,
"logits/chosen": -1.679877519607544,
"logits/rejected": -1.480136752128601,
"logps/chosen": -234.7102813720703,
"logps/rejected": -351.16754150390625,
"loss": 0.4954,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0692477598786354,
"rewards/margins": 0.019011324271559715,
"rewards/rejected": -0.08825908601284027,
"step": 42
},
{
"debug/policy_chosen_logits": -1.6371181011199951,
"debug/policy_chosen_logps": -240.50299072265625,
"debug/policy_rejected_logits": -1.487899899482727,
"debug/policy_rejected_logps": -341.2508239746094,
"debug/reference_chosen_logps": -238.32655334472656,
"debug/reference_rejected_logps": -331.7200622558594,
"epoch": 0.9772727272727273,
"grad_norm": 20.71302449718998,
"learning_rate": 1e-06,
"logits/chosen": -1.6371181011199951,
"logits/rejected": -1.487899899482727,
"logps/chosen": -240.50299072265625,
"logps/rejected": -341.2508239746094,
"loss": 0.4785,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.021764487028121948,
"rewards/margins": 0.07354332506656647,
"rewards/rejected": -0.09530781209468842,
"step": 43
},
{
"debug/policy_chosen_logits": -1.2511305809020996,
"debug/policy_chosen_logps": -212.03353881835938,
"debug/policy_rejected_logits": -1.1813651323318481,
"debug/policy_rejected_logps": -274.3526611328125,
"debug/reference_chosen_logps": -210.2974853515625,
"debug/reference_rejected_logps": -270.35479736328125,
"epoch": 1.0,
"grad_norm": 65.73660428884652,
"learning_rate": 1e-06,
"logits/chosen": -1.2511305809020996,
"logits/rejected": -1.1813651323318481,
"logps/chosen": -212.03353881835938,
"logps/rejected": -274.3526611328125,
"loss": 0.4684,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.017360497266054153,
"rewards/margins": 0.022618159651756287,
"rewards/rejected": -0.03997865691781044,
"step": 44
},
{
"epoch": 1.0,
"step": 44,
"total_flos": 0.0,
"train_loss": 0.4935350160707127,
"train_runtime": 155.2111,
"train_samples_per_second": 17.937,
"train_steps_per_second": 0.283
}
],
"logging_steps": 1,
"max_steps": 44,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}