qwen2_chat_reflct_adamw_iter4 / trainer_state.json
yiran-wang3's picture
End of training
3251614 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 43,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"debug/policy_chosen_logits": -1.5687581300735474,
"debug/policy_chosen_logps": -240.2513427734375,
"debug/policy_rejected_logits": -1.6221139430999756,
"debug/policy_rejected_logps": -264.4752197265625,
"debug/reference_chosen_logps": -240.2513427734375,
"debug/reference_rejected_logps": -264.4752197265625,
"epoch": 0.023255813953488372,
"grad_norm": 14.314275545525218,
"learning_rate": 1e-06,
"logits/chosen": -1.5687581300735474,
"logits/rejected": -1.6221139430999756,
"logps/chosen": -240.2513427734375,
"logps/rejected": -264.4752197265625,
"loss": 0.5,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"debug/policy_chosen_logits": -1.4775172472000122,
"debug/policy_chosen_logps": -235.343994140625,
"debug/policy_rejected_logits": -1.3492165803909302,
"debug/policy_rejected_logps": -283.1033935546875,
"debug/reference_chosen_logps": -234.93467712402344,
"debug/reference_rejected_logps": -283.2170104980469,
"epoch": 0.046511627906976744,
"grad_norm": 18.67922014806989,
"learning_rate": 1e-06,
"logits/chosen": -1.4775172472000122,
"logits/rejected": -1.3492165803909302,
"logps/chosen": -235.343994140625,
"logps/rejected": -283.1033935546875,
"loss": 0.4959,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.004093170166015625,
"rewards/margins": -0.005229205824434757,
"rewards/rejected": 0.0011360361240804195,
"step": 2
},
{
"debug/policy_chosen_logits": -1.6865235567092896,
"debug/policy_chosen_logps": -230.66635131835938,
"debug/policy_rejected_logits": -1.6258912086486816,
"debug/policy_rejected_logps": -228.0758514404297,
"debug/reference_chosen_logps": -225.64306640625,
"debug/reference_rejected_logps": -223.4805908203125,
"epoch": 0.06976744186046512,
"grad_norm": 25.946369831783773,
"learning_rate": 1e-06,
"logits/chosen": -1.6865235567092896,
"logits/rejected": -1.6258912086486816,
"logps/chosen": -230.66635131835938,
"logps/rejected": -228.0758514404297,
"loss": 0.5069,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.050232600420713425,
"rewards/margins": -0.004280166234821081,
"rewards/rejected": -0.04595243185758591,
"step": 3
},
{
"debug/policy_chosen_logits": -1.6345511674880981,
"debug/policy_chosen_logps": -230.35598754882812,
"debug/policy_rejected_logits": -1.594412088394165,
"debug/policy_rejected_logps": -235.36544799804688,
"debug/reference_chosen_logps": -227.8475799560547,
"debug/reference_rejected_logps": -230.77169799804688,
"epoch": 0.09302325581395349,
"grad_norm": 12.562991726069878,
"learning_rate": 1e-06,
"logits/chosen": -1.6345511674880981,
"logits/rejected": -1.594412088394165,
"logps/chosen": -230.35598754882812,
"logps/rejected": -235.36544799804688,
"loss": 0.4907,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.025084247812628746,
"rewards/margins": 0.02085309848189354,
"rewards/rejected": -0.045937344431877136,
"step": 4
},
{
"debug/policy_chosen_logits": -1.5187644958496094,
"debug/policy_chosen_logps": -209.38815307617188,
"debug/policy_rejected_logits": -1.5565170049667358,
"debug/policy_rejected_logps": -261.0048522949219,
"debug/reference_chosen_logps": -204.9683837890625,
"debug/reference_rejected_logps": -256.2153015136719,
"epoch": 0.11627906976744186,
"grad_norm": 42.7709320228073,
"learning_rate": 1e-06,
"logits/chosen": -1.5187644958496094,
"logits/rejected": -1.5565170049667358,
"logps/chosen": -209.38815307617188,
"logps/rejected": -261.0048522949219,
"loss": 0.5197,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.04419763758778572,
"rewards/margins": 0.0036978721618652344,
"rewards/rejected": -0.04789550602436066,
"step": 5
},
{
"debug/policy_chosen_logits": -1.654346227645874,
"debug/policy_chosen_logps": -208.22152709960938,
"debug/policy_rejected_logits": -1.472536325454712,
"debug/policy_rejected_logps": -277.9122314453125,
"debug/reference_chosen_logps": -208.6928253173828,
"debug/reference_rejected_logps": -277.05023193359375,
"epoch": 0.13953488372093023,
"grad_norm": 18.148530267479675,
"learning_rate": 1e-06,
"logits/chosen": -1.654346227645874,
"logits/rejected": -1.472536325454712,
"logps/chosen": -208.22152709960938,
"logps/rejected": -277.9122314453125,
"loss": 0.507,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0047130584716796875,
"rewards/margins": 0.013332920148968697,
"rewards/rejected": -0.008619861677289009,
"step": 6
},
{
"debug/policy_chosen_logits": -1.619295597076416,
"debug/policy_chosen_logps": -240.17440795898438,
"debug/policy_rejected_logits": -1.5930582284927368,
"debug/policy_rejected_logps": -303.7572326660156,
"debug/reference_chosen_logps": -240.71119689941406,
"debug/reference_rejected_logps": -304.2488708496094,
"epoch": 0.16279069767441862,
"grad_norm": 18.12592422181744,
"learning_rate": 1e-06,
"logits/chosen": -1.619295597076416,
"logits/rejected": -1.5930582284927368,
"logps/chosen": -240.17440795898438,
"logps/rejected": -303.7572326660156,
"loss": 0.5014,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.005367736332118511,
"rewards/margins": 0.0004512788727879524,
"rewards/rejected": 0.004916457924991846,
"step": 7
},
{
"debug/policy_chosen_logits": -1.5426918268203735,
"debug/policy_chosen_logps": -242.49334716796875,
"debug/policy_rejected_logits": -1.515419363975525,
"debug/policy_rejected_logps": -246.33676147460938,
"debug/reference_chosen_logps": -245.80419921875,
"debug/reference_rejected_logps": -248.84983825683594,
"epoch": 0.18604651162790697,
"grad_norm": 37.16270378133235,
"learning_rate": 1e-06,
"logits/chosen": -1.5426918268203735,
"logits/rejected": -1.515419363975525,
"logps/chosen": -242.49334716796875,
"logps/rejected": -246.33676147460938,
"loss": 0.4963,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.03310825303196907,
"rewards/margins": 0.007977409288287163,
"rewards/rejected": 0.02513084188103676,
"step": 8
},
{
"debug/policy_chosen_logits": -1.5298963785171509,
"debug/policy_chosen_logps": -225.92041015625,
"debug/policy_rejected_logits": -1.4147241115570068,
"debug/policy_rejected_logps": -270.5355224609375,
"debug/reference_chosen_logps": -228.79443359375,
"debug/reference_rejected_logps": -272.68603515625,
"epoch": 0.20930232558139536,
"grad_norm": 18.861065558487233,
"learning_rate": 1e-06,
"logits/chosen": -1.5298963785171509,
"logits/rejected": -1.4147241115570068,
"logps/chosen": -225.92041015625,
"logps/rejected": -270.5355224609375,
"loss": 0.5,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0287402905523777,
"rewards/margins": 0.007235164754092693,
"rewards/rejected": 0.021505124866962433,
"step": 9
},
{
"debug/policy_chosen_logits": -1.5066547393798828,
"debug/policy_chosen_logps": -216.030517578125,
"debug/policy_rejected_logits": -1.4625401496887207,
"debug/policy_rejected_logps": -217.58367919921875,
"debug/reference_chosen_logps": -219.08502197265625,
"debug/reference_rejected_logps": -219.8885955810547,
"epoch": 0.23255813953488372,
"grad_norm": 12.691840821738246,
"learning_rate": 1e-06,
"logits/chosen": -1.5066547393798828,
"logits/rejected": -1.4625401496887207,
"logps/chosen": -216.030517578125,
"logps/rejected": -217.58367919921875,
"loss": 0.4982,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.030545100569725037,
"rewards/margins": 0.007496070582419634,
"rewards/rejected": 0.02304903045296669,
"step": 10
},
{
"debug/policy_chosen_logits": -1.6109825372695923,
"debug/policy_chosen_logps": -194.70681762695312,
"debug/policy_rejected_logits": -1.5127055644989014,
"debug/policy_rejected_logps": -261.20880126953125,
"debug/reference_chosen_logps": -200.0032958984375,
"debug/reference_rejected_logps": -264.6978454589844,
"epoch": 0.2558139534883721,
"grad_norm": 34.88211840288691,
"learning_rate": 1e-06,
"logits/chosen": -1.6109825372695923,
"logits/rejected": -1.5127055644989014,
"logps/chosen": -194.70681762695312,
"logps/rejected": -261.20880126953125,
"loss": 0.5055,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.05296493321657181,
"rewards/margins": 0.018074415624141693,
"rewards/rejected": 0.034890517592430115,
"step": 11
},
{
"debug/policy_chosen_logits": -1.644713282585144,
"debug/policy_chosen_logps": -242.20831298828125,
"debug/policy_rejected_logits": -1.6428948640823364,
"debug/policy_rejected_logps": -256.0648498535156,
"debug/reference_chosen_logps": -245.20326232910156,
"debug/reference_rejected_logps": -257.87481689453125,
"epoch": 0.27906976744186046,
"grad_norm": 24.62141741438646,
"learning_rate": 1e-06,
"logits/chosen": -1.644713282585144,
"logits/rejected": -1.6428948640823364,
"logps/chosen": -242.20831298828125,
"logps/rejected": -256.0648498535156,
"loss": 0.507,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.029949625954031944,
"rewards/margins": 0.0118501465767622,
"rewards/rejected": 0.018099479377269745,
"step": 12
},
{
"debug/policy_chosen_logits": -1.6160894632339478,
"debug/policy_chosen_logps": -227.63302612304688,
"debug/policy_rejected_logits": -1.6384341716766357,
"debug/policy_rejected_logps": -264.2388916015625,
"debug/reference_chosen_logps": -230.62490844726562,
"debug/reference_rejected_logps": -264.9801025390625,
"epoch": 0.3023255813953488,
"grad_norm": 10.776881537717472,
"learning_rate": 1e-06,
"logits/chosen": -1.6160894632339478,
"logits/rejected": -1.6384341716766357,
"logps/chosen": -227.63302612304688,
"logps/rejected": -264.2388916015625,
"loss": 0.4976,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.029918955639004707,
"rewards/margins": 0.022506674751639366,
"rewards/rejected": 0.007412281818687916,
"step": 13
},
{
"debug/policy_chosen_logits": -1.6279401779174805,
"debug/policy_chosen_logps": -220.282958984375,
"debug/policy_rejected_logits": -1.4893845319747925,
"debug/policy_rejected_logps": -272.10931396484375,
"debug/reference_chosen_logps": -222.31028747558594,
"debug/reference_rejected_logps": -272.71044921875,
"epoch": 0.32558139534883723,
"grad_norm": 10.036756062227226,
"learning_rate": 1e-06,
"logits/chosen": -1.6279401779174805,
"logits/rejected": -1.4893845319747925,
"logps/chosen": -220.282958984375,
"logps/rejected": -272.10931396484375,
"loss": 0.4935,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.020273476839065552,
"rewards/margins": 0.014262351207435131,
"rewards/rejected": 0.006011123303323984,
"step": 14
},
{
"debug/policy_chosen_logits": -1.5787980556488037,
"debug/policy_chosen_logps": -261.0044250488281,
"debug/policy_rejected_logits": -1.3867720365524292,
"debug/policy_rejected_logps": -301.5718994140625,
"debug/reference_chosen_logps": -258.84735107421875,
"debug/reference_rejected_logps": -297.58404541015625,
"epoch": 0.3488372093023256,
"grad_norm": 10.65558553278192,
"learning_rate": 1e-06,
"logits/chosen": -1.5787980556488037,
"logits/rejected": -1.3867720365524292,
"logps/chosen": -261.0044250488281,
"logps/rejected": -301.5718994140625,
"loss": 0.4873,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.021570798009634018,
"rewards/margins": 0.018307799473404884,
"rewards/rejected": -0.03987859562039375,
"step": 15
},
{
"debug/policy_chosen_logits": -1.5956577062606812,
"debug/policy_chosen_logps": -222.5416259765625,
"debug/policy_rejected_logits": -1.4434815645217896,
"debug/policy_rejected_logps": -279.348388671875,
"debug/reference_chosen_logps": -221.23260498046875,
"debug/reference_rejected_logps": -273.97540283203125,
"epoch": 0.37209302325581395,
"grad_norm": 36.49853384207237,
"learning_rate": 1e-06,
"logits/chosen": -1.5956577062606812,
"logits/rejected": -1.4434815645217896,
"logps/chosen": -222.5416259765625,
"logps/rejected": -279.348388671875,
"loss": 0.5003,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.013090074062347412,
"rewards/margins": 0.040639691054821014,
"rewards/rejected": -0.053729765117168427,
"step": 16
},
{
"debug/policy_chosen_logits": -1.4420965909957886,
"debug/policy_chosen_logps": -215.4423828125,
"debug/policy_rejected_logits": -1.5232738256454468,
"debug/policy_rejected_logps": -288.1341552734375,
"debug/reference_chosen_logps": -213.68832397460938,
"debug/reference_rejected_logps": -286.45086669921875,
"epoch": 0.3953488372093023,
"grad_norm": 24.203474268576745,
"learning_rate": 1e-06,
"logits/chosen": -1.4420965909957886,
"logits/rejected": -1.5232738256454468,
"logps/chosen": -215.4423828125,
"logps/rejected": -288.1341552734375,
"loss": 0.4918,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.01754041761159897,
"rewards/margins": -0.0007077232003211975,
"rewards/rejected": -0.01683269441127777,
"step": 17
},
{
"debug/policy_chosen_logits": -1.4279348850250244,
"debug/policy_chosen_logps": -242.89749145507812,
"debug/policy_rejected_logits": -1.3261935710906982,
"debug/policy_rejected_logps": -230.28863525390625,
"debug/reference_chosen_logps": -240.6783447265625,
"debug/reference_rejected_logps": -226.51815795898438,
"epoch": 0.4186046511627907,
"grad_norm": 44.37824041962939,
"learning_rate": 1e-06,
"logits/chosen": -1.4279348850250244,
"logits/rejected": -1.3261935710906982,
"logps/chosen": -242.89749145507812,
"logps/rejected": -230.28863525390625,
"loss": 0.5044,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.022191638126969337,
"rewards/margins": 0.01551321055740118,
"rewards/rejected": -0.03770485147833824,
"step": 18
},
{
"debug/policy_chosen_logits": -1.58405339717865,
"debug/policy_chosen_logps": -236.5458984375,
"debug/policy_rejected_logits": -1.5539088249206543,
"debug/policy_rejected_logps": -233.43719482421875,
"debug/reference_chosen_logps": -236.3321533203125,
"debug/reference_rejected_logps": -231.97726440429688,
"epoch": 0.4418604651162791,
"grad_norm": 12.540633658003186,
"learning_rate": 1e-06,
"logits/chosen": -1.58405339717865,
"logits/rejected": -1.5539088249206543,
"logps/chosen": -236.5458984375,
"logps/rejected": -233.43719482421875,
"loss": 0.5059,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.002137584611773491,
"rewards/margins": 0.012461718171834946,
"rewards/rejected": -0.014599304646253586,
"step": 19
},
{
"debug/policy_chosen_logits": -1.466562032699585,
"debug/policy_chosen_logps": -194.52447509765625,
"debug/policy_rejected_logits": -1.4163392782211304,
"debug/policy_rejected_logps": -237.53216552734375,
"debug/reference_chosen_logps": -194.98049926757812,
"debug/reference_rejected_logps": -237.75314331054688,
"epoch": 0.46511627906976744,
"grad_norm": 14.133924214011163,
"learning_rate": 1e-06,
"logits/chosen": -1.466562032699585,
"logits/rejected": -1.4163392782211304,
"logps/chosen": -194.52447509765625,
"logps/rejected": -237.53216552734375,
"loss": 0.4895,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.004560394212603569,
"rewards/margins": 0.00235048308968544,
"rewards/rejected": 0.0022099113557487726,
"step": 20
},
{
"debug/policy_chosen_logits": -1.5900827646255493,
"debug/policy_chosen_logps": -212.4789276123047,
"debug/policy_rejected_logits": -1.4683177471160889,
"debug/policy_rejected_logps": -257.5311279296875,
"debug/reference_chosen_logps": -211.9457550048828,
"debug/reference_rejected_logps": -254.302490234375,
"epoch": 0.4883720930232558,
"grad_norm": 17.882111129846162,
"learning_rate": 1e-06,
"logits/chosen": -1.5900827646255493,
"logits/rejected": -1.4683177471160889,
"logps/chosen": -212.4789276123047,
"logps/rejected": -257.5311279296875,
"loss": 0.4799,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.00533168762922287,
"rewards/margins": 0.026954688131809235,
"rewards/rejected": -0.032286375761032104,
"step": 21
},
{
"debug/policy_chosen_logits": -1.4656310081481934,
"debug/policy_chosen_logps": -263.0152282714844,
"debug/policy_rejected_logits": -1.3645009994506836,
"debug/policy_rejected_logps": -283.24566650390625,
"debug/reference_chosen_logps": -262.25970458984375,
"debug/reference_rejected_logps": -280.8420715332031,
"epoch": 0.5116279069767442,
"grad_norm": 17.255770475731207,
"learning_rate": 1e-06,
"logits/chosen": -1.4656310081481934,
"logits/rejected": -1.3645009994506836,
"logps/chosen": -263.0152282714844,
"logps/rejected": -283.24566650390625,
"loss": 0.479,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.007554950192570686,
"rewards/margins": 0.01648113504052162,
"rewards/rejected": -0.02403608150780201,
"step": 22
},
{
"debug/policy_chosen_logits": -1.4521846771240234,
"debug/policy_chosen_logps": -216.39169311523438,
"debug/policy_rejected_logits": -1.3643300533294678,
"debug/policy_rejected_logps": -281.2818603515625,
"debug/reference_chosen_logps": -223.86587524414062,
"debug/reference_rejected_logps": -278.6108703613281,
"epoch": 0.5348837209302325,
"grad_norm": 22.42065485842657,
"learning_rate": 1e-06,
"logits/chosen": -1.4521846771240234,
"logits/rejected": -1.3643300533294678,
"logps/chosen": -216.39169311523438,
"logps/rejected": -281.2818603515625,
"loss": 0.4761,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.07474187761545181,
"rewards/margins": 0.10145200788974762,
"rewards/rejected": -0.02671012654900551,
"step": 23
},
{
"debug/policy_chosen_logits": -1.596596121788025,
"debug/policy_chosen_logps": -240.6717987060547,
"debug/policy_rejected_logits": -1.5593181848526,
"debug/policy_rejected_logps": -339.8119812011719,
"debug/reference_chosen_logps": -242.0146484375,
"debug/reference_rejected_logps": -335.9112548828125,
"epoch": 0.5581395348837209,
"grad_norm": 15.69048628226096,
"learning_rate": 1e-06,
"logits/chosen": -1.596596121788025,
"logits/rejected": -1.5593181848526,
"logps/chosen": -240.6717987060547,
"logps/rejected": -339.8119812011719,
"loss": 0.4765,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.013428498059511185,
"rewards/margins": 0.05243583396077156,
"rewards/rejected": -0.039007339626550674,
"step": 24
},
{
"debug/policy_chosen_logits": -1.5627775192260742,
"debug/policy_chosen_logps": -243.03045654296875,
"debug/policy_rejected_logits": -1.5146582126617432,
"debug/policy_rejected_logps": -339.4427490234375,
"debug/reference_chosen_logps": -244.926513671875,
"debug/reference_rejected_logps": -337.1531982421875,
"epoch": 0.5813953488372093,
"grad_norm": 23.599376538578916,
"learning_rate": 1e-06,
"logits/chosen": -1.5627775192260742,
"logits/rejected": -1.5146582126617432,
"logps/chosen": -243.03045654296875,
"logps/rejected": -339.4427490234375,
"loss": 0.4976,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.018960533663630486,
"rewards/margins": 0.04185573384165764,
"rewards/rejected": -0.02289520390331745,
"step": 25
},
{
"debug/policy_chosen_logits": -1.4482909440994263,
"debug/policy_chosen_logps": -243.66656494140625,
"debug/policy_rejected_logits": -1.3864490985870361,
"debug/policy_rejected_logps": -269.7567138671875,
"debug/reference_chosen_logps": -243.5550079345703,
"debug/reference_rejected_logps": -265.46270751953125,
"epoch": 0.6046511627906976,
"grad_norm": 15.856946618319062,
"learning_rate": 1e-06,
"logits/chosen": -1.4482909440994263,
"logits/rejected": -1.3864490985870361,
"logps/chosen": -243.66656494140625,
"logps/rejected": -269.7567138671875,
"loss": 0.4718,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.001115493942052126,
"rewards/margins": 0.04182462394237518,
"rewards/rejected": -0.04294012114405632,
"step": 26
},
{
"debug/policy_chosen_logits": -1.6767849922180176,
"debug/policy_chosen_logps": -235.87283325195312,
"debug/policy_rejected_logits": -1.4402155876159668,
"debug/policy_rejected_logps": -255.25155639648438,
"debug/reference_chosen_logps": -234.12271118164062,
"debug/reference_rejected_logps": -252.92681884765625,
"epoch": 0.627906976744186,
"grad_norm": 26.8171661009806,
"learning_rate": 1e-06,
"logits/chosen": -1.6767849922180176,
"logits/rejected": -1.4402155876159668,
"logps/chosen": -235.87283325195312,
"logps/rejected": -255.25155639648438,
"loss": 0.4911,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.017501164227724075,
"rewards/margins": 0.005746154114603996,
"rewards/rejected": -0.023247316479682922,
"step": 27
},
{
"debug/policy_chosen_logits": -1.474959135055542,
"debug/policy_chosen_logps": -250.03805541992188,
"debug/policy_rejected_logits": -1.33174729347229,
"debug/policy_rejected_logps": -275.7778015136719,
"debug/reference_chosen_logps": -246.68450927734375,
"debug/reference_rejected_logps": -271.7251892089844,
"epoch": 0.6511627906976745,
"grad_norm": 23.009106257244806,
"learning_rate": 1e-06,
"logits/chosen": -1.474959135055542,
"logits/rejected": -1.33174729347229,
"logps/chosen": -250.03805541992188,
"logps/rejected": -275.7778015136719,
"loss": 0.4741,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.03353559225797653,
"rewards/margins": 0.006990719586610794,
"rewards/rejected": -0.040526311844587326,
"step": 28
},
{
"debug/policy_chosen_logits": -1.5180094242095947,
"debug/policy_chosen_logps": -239.39962768554688,
"debug/policy_rejected_logits": -1.3486112356185913,
"debug/policy_rejected_logps": -280.152587890625,
"debug/reference_chosen_logps": -235.7870635986328,
"debug/reference_rejected_logps": -271.69512939453125,
"epoch": 0.6744186046511628,
"grad_norm": 29.559799359310976,
"learning_rate": 1e-06,
"logits/chosen": -1.5180094242095947,
"logits/rejected": -1.3486112356185913,
"logps/chosen": -239.39962768554688,
"logps/rejected": -280.152587890625,
"loss": 0.4648,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.03612573444843292,
"rewards/margins": 0.04844905436038971,
"rewards/rejected": -0.08457479625940323,
"step": 29
},
{
"debug/policy_chosen_logits": -1.5323574542999268,
"debug/policy_chosen_logps": -230.34732055664062,
"debug/policy_rejected_logits": -1.4196269512176514,
"debug/policy_rejected_logps": -299.549072265625,
"debug/reference_chosen_logps": -227.33663940429688,
"debug/reference_rejected_logps": -290.41229248046875,
"epoch": 0.6976744186046512,
"grad_norm": 24.104606390659608,
"learning_rate": 1e-06,
"logits/chosen": -1.5323574542999268,
"logits/rejected": -1.4196269512176514,
"logps/chosen": -230.34732055664062,
"logps/rejected": -299.549072265625,
"loss": 0.4845,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.03010694310069084,
"rewards/margins": 0.0612606406211853,
"rewards/rejected": -0.09136758744716644,
"step": 30
},
{
"debug/policy_chosen_logits": -1.5471208095550537,
"debug/policy_chosen_logps": -233.48435974121094,
"debug/policy_rejected_logits": -1.507702350616455,
"debug/policy_rejected_logps": -299.49298095703125,
"debug/reference_chosen_logps": -230.37808227539062,
"debug/reference_rejected_logps": -290.85491943359375,
"epoch": 0.7209302325581395,
"grad_norm": 11.501362477806966,
"learning_rate": 1e-06,
"logits/chosen": -1.5471208095550537,
"logits/rejected": -1.507702350616455,
"logps/chosen": -233.48435974121094,
"logps/rejected": -299.49298095703125,
"loss": 0.4777,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.031062887981534004,
"rewards/margins": 0.05531751364469528,
"rewards/rejected": -0.08638040721416473,
"step": 31
},
{
"debug/policy_chosen_logits": -1.3573694229125977,
"debug/policy_chosen_logps": -276.0484619140625,
"debug/policy_rejected_logits": -1.4295967817306519,
"debug/policy_rejected_logps": -251.19253540039062,
"debug/reference_chosen_logps": -269.07147216796875,
"debug/reference_rejected_logps": -245.41119384765625,
"epoch": 0.7441860465116279,
"grad_norm": 44.933121351184646,
"learning_rate": 1e-06,
"logits/chosen": -1.3573694229125977,
"logits/rejected": -1.4295967817306519,
"logps/chosen": -276.0484619140625,
"logps/rejected": -251.19253540039062,
"loss": 0.4882,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.06977000832557678,
"rewards/margins": -0.011956671252846718,
"rewards/rejected": -0.057813338935375214,
"step": 32
},
{
"debug/policy_chosen_logits": -1.532698392868042,
"debug/policy_chosen_logps": -210.01528930664062,
"debug/policy_rejected_logits": -1.5289320945739746,
"debug/policy_rejected_logps": -266.588134765625,
"debug/reference_chosen_logps": -212.3720245361328,
"debug/reference_rejected_logps": -261.5830993652344,
"epoch": 0.7674418604651163,
"grad_norm": 16.92613442301389,
"learning_rate": 1e-06,
"logits/chosen": -1.532698392868042,
"logits/rejected": -1.5289320945739746,
"logps/chosen": -210.01528930664062,
"logps/rejected": -266.588134765625,
"loss": 0.4701,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.023567447438836098,
"rewards/margins": 0.0736178606748581,
"rewards/rejected": -0.050050411373376846,
"step": 33
},
{
"debug/policy_chosen_logits": -1.6075690984725952,
"debug/policy_chosen_logps": -222.28057861328125,
"debug/policy_rejected_logits": -1.5301072597503662,
"debug/policy_rejected_logps": -231.0042724609375,
"debug/reference_chosen_logps": -221.35081481933594,
"debug/reference_rejected_logps": -232.4139404296875,
"epoch": 0.7906976744186046,
"grad_norm": 30.286832542440518,
"learning_rate": 1e-06,
"logits/chosen": -1.6075690984725952,
"logits/rejected": -1.5301072597503662,
"logps/chosen": -222.28057861328125,
"logps/rejected": -231.0042724609375,
"loss": 0.4957,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.009297618642449379,
"rewards/margins": -0.023394297808408737,
"rewards/rejected": 0.014096679165959358,
"step": 34
},
{
"debug/policy_chosen_logits": -1.6365246772766113,
"debug/policy_chosen_logps": -214.15689086914062,
"debug/policy_rejected_logits": -1.3568267822265625,
"debug/policy_rejected_logps": -274.84320068359375,
"debug/reference_chosen_logps": -219.61041259765625,
"debug/reference_rejected_logps": -271.06793212890625,
"epoch": 0.813953488372093,
"grad_norm": 10.413515346816709,
"learning_rate": 1e-06,
"logits/chosen": -1.6365246772766113,
"logits/rejected": -1.3568267822265625,
"logps/chosen": -214.15689086914062,
"logps/rejected": -274.84320068359375,
"loss": 0.4745,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.05453508347272873,
"rewards/margins": 0.09228822588920593,
"rewards/rejected": -0.0377531424164772,
"step": 35
},
{
"debug/policy_chosen_logits": -1.4882985353469849,
"debug/policy_chosen_logps": -224.943115234375,
"debug/policy_rejected_logits": -1.5247392654418945,
"debug/policy_rejected_logps": -291.1658020019531,
"debug/reference_chosen_logps": -230.88302612304688,
"debug/reference_rejected_logps": -292.3128967285156,
"epoch": 0.8372093023255814,
"grad_norm": 21.658575078370163,
"learning_rate": 1e-06,
"logits/chosen": -1.4882985353469849,
"logits/rejected": -1.5247392654418945,
"logps/chosen": -224.943115234375,
"logps/rejected": -291.1658020019531,
"loss": 0.4603,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.059399355202913284,
"rewards/margins": 0.04792825132608414,
"rewards/rejected": 0.011471100151538849,
"step": 36
},
{
"debug/policy_chosen_logits": -1.5648789405822754,
"debug/policy_chosen_logps": -232.97958374023438,
"debug/policy_rejected_logits": -1.4898722171783447,
"debug/policy_rejected_logps": -264.453369140625,
"debug/reference_chosen_logps": -236.86109924316406,
"debug/reference_rejected_logps": -266.53448486328125,
"epoch": 0.8604651162790697,
"grad_norm": 13.183542989418173,
"learning_rate": 1e-06,
"logits/chosen": -1.5648789405822754,
"logits/rejected": -1.4898722171783447,
"logps/chosen": -232.97958374023438,
"logps/rejected": -264.453369140625,
"loss": 0.4871,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.03881513699889183,
"rewards/margins": 0.018004285171628,
"rewards/rejected": 0.020810849964618683,
"step": 37
},
{
"debug/policy_chosen_logits": -1.3791043758392334,
"debug/policy_chosen_logps": -228.21205139160156,
"debug/policy_rejected_logits": -1.3903659582138062,
"debug/policy_rejected_logps": -227.752197265625,
"debug/reference_chosen_logps": -230.1770477294922,
"debug/reference_rejected_logps": -230.64239501953125,
"epoch": 0.8837209302325582,
"grad_norm": 17.81188100770416,
"learning_rate": 1e-06,
"logits/chosen": -1.3791043758392334,
"logits/rejected": -1.3903659582138062,
"logps/chosen": -228.21205139160156,
"logps/rejected": -227.752197265625,
"loss": 0.4778,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.019649982452392578,
"rewards/margins": -0.009251842275261879,
"rewards/rejected": 0.028901822865009308,
"step": 38
},
{
"debug/policy_chosen_logits": -1.4482218027114868,
"debug/policy_chosen_logps": -247.0265655517578,
"debug/policy_rejected_logits": -1.3742115497589111,
"debug/policy_rejected_logps": -305.1549072265625,
"debug/reference_chosen_logps": -249.18740844726562,
"debug/reference_rejected_logps": -301.026123046875,
"epoch": 0.9069767441860465,
"grad_norm": 18.246091231595155,
"learning_rate": 1e-06,
"logits/chosen": -1.4482218027114868,
"logits/rejected": -1.3742115497589111,
"logps/chosen": -247.0265655517578,
"logps/rejected": -305.1549072265625,
"loss": 0.4569,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.021608371287584305,
"rewards/margins": 0.0628962367773056,
"rewards/rejected": -0.0412878580391407,
"step": 39
},
{
"debug/policy_chosen_logits": -1.5543980598449707,
"debug/policy_chosen_logps": -257.4435729980469,
"debug/policy_rejected_logits": -1.5073050260543823,
"debug/policy_rejected_logps": -296.2988586425781,
"debug/reference_chosen_logps": -256.39544677734375,
"debug/reference_rejected_logps": -288.50433349609375,
"epoch": 0.9302325581395349,
"grad_norm": 14.177163201466067,
"learning_rate": 1e-06,
"logits/chosen": -1.5543980598449707,
"logits/rejected": -1.5073050260543823,
"logps/chosen": -257.4435729980469,
"logps/rejected": -296.2988586425781,
"loss": 0.4851,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.010481302626430988,
"rewards/margins": 0.0674639493227005,
"rewards/rejected": -0.07794524729251862,
"step": 40
},
{
"debug/policy_chosen_logits": -1.5510001182556152,
"debug/policy_chosen_logps": -262.91461181640625,
"debug/policy_rejected_logits": -1.4993880987167358,
"debug/policy_rejected_logps": -288.2521667480469,
"debug/reference_chosen_logps": -253.84947204589844,
"debug/reference_rejected_logps": -285.00738525390625,
"epoch": 0.9534883720930233,
"grad_norm": 64.06356256852192,
"learning_rate": 1e-06,
"logits/chosen": -1.5510001182556152,
"logits/rejected": -1.4993880987167358,
"logps/chosen": -262.91461181640625,
"logps/rejected": -288.2521667480469,
"loss": 0.4995,
"rewards/accuracies": 0.125,
"rewards/chosen": -0.09065132588148117,
"rewards/margins": -0.05820371210575104,
"rewards/rejected": -0.032447606325149536,
"step": 41
},
{
"debug/policy_chosen_logits": -1.6003302335739136,
"debug/policy_chosen_logps": -224.11663818359375,
"debug/policy_rejected_logits": -1.5286082029342651,
"debug/policy_rejected_logps": -271.67974853515625,
"debug/reference_chosen_logps": -221.91209411621094,
"debug/reference_rejected_logps": -258.791015625,
"epoch": 0.9767441860465116,
"grad_norm": 61.90638727730458,
"learning_rate": 1e-06,
"logits/chosen": -1.6003302335739136,
"logits/rejected": -1.5286082029342651,
"logps/chosen": -224.11663818359375,
"logps/rejected": -271.67974853515625,
"loss": 0.4904,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.022045554593205452,
"rewards/margins": 0.10684183239936829,
"rewards/rejected": -0.1288873851299286,
"step": 42
},
{
"debug/policy_chosen_logits": -1.5732522010803223,
"debug/policy_chosen_logps": -237.1086883544922,
"debug/policy_rejected_logits": -1.405612826347351,
"debug/policy_rejected_logps": -320.40643310546875,
"debug/reference_chosen_logps": -234.65167236328125,
"debug/reference_rejected_logps": -311.38165283203125,
"epoch": 1.0,
"grad_norm": 43.37698691621295,
"learning_rate": 1e-06,
"logits/chosen": -1.5732522010803223,
"logits/rejected": -1.405612826347351,
"logps/chosen": -237.1086883544922,
"logps/rejected": -320.40643310546875,
"loss": 0.4906,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.024570178240537643,
"rewards/margins": 0.065677709877491,
"rewards/rejected": -0.09024789184331894,
"step": 43
},
{
"epoch": 1.0,
"step": 43,
"total_flos": 0.0,
"train_loss": 0.489490317743878,
"train_runtime": 149.7969,
"train_samples_per_second": 18.318,
"train_steps_per_second": 0.287
}
],
"logging_steps": 1,
"max_steps": 43,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}