llama-3-8b-instruct-sppo-iter3 / trainer_state.json
jcmei's picture
End of training
ec3ffcf verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 100,
"global_step": 312,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032,
"grad_norm": 647945.4912541932,
"learning_rate": 1.5625e-08,
"logits/chosen": -0.34773391485214233,
"logits/rejected": -0.6075438261032104,
"logps/chosen": -72.6761474609375,
"logps/rejected": -90.11207580566406,
"loss": 128855.9062,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.032,
"grad_norm": 973324.1712020065,
"learning_rate": 1.5624999999999999e-07,
"logits/chosen": -0.5611530542373657,
"logits/rejected": -0.5887401103973389,
"logps/chosen": -80.2381591796875,
"logps/rejected": -83.50374603271484,
"loss": 124005.5694,
"rewards/accuracies": 0.4166666567325592,
"rewards/chosen": -0.0019423539051786065,
"rewards/margins": 5.1506802265066653e-05,
"rewards/rejected": -0.0019938608165830374,
"step": 10
},
{
"epoch": 0.064,
"grad_norm": 619327.407060219,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -0.6772833466529846,
"logits/rejected": -0.6759974360466003,
"logps/chosen": -103.69559478759766,
"logps/rejected": -107.43603515625,
"loss": 124210.2125,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.004284867085516453,
"rewards/margins": -3.467009082669392e-05,
"rewards/rejected": -0.004250196740031242,
"step": 20
},
{
"epoch": 0.096,
"grad_norm": 698173.4505162692,
"learning_rate": 4.6874999999999996e-07,
"logits/chosen": -0.7464536428451538,
"logits/rejected": -0.7253994345664978,
"logps/chosen": -90.76727294921875,
"logps/rejected": -93.79044342041016,
"loss": 126548.2375,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.003106231102719903,
"rewards/margins": -0.0005979427369311452,
"rewards/rejected": -0.0025082884822040796,
"step": 30
},
{
"epoch": 0.128,
"grad_norm": 637174.9970357245,
"learning_rate": 4.857142857142857e-07,
"logits/chosen": -0.7085025906562805,
"logits/rejected": -0.7023540139198303,
"logps/chosen": -87.2509765625,
"logps/rejected": -88.0642318725586,
"loss": 124747.6875,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.007863897830247879,
"rewards/margins": -0.0013397409347817302,
"rewards/rejected": -0.006524157710373402,
"step": 40
},
{
"epoch": 0.16,
"grad_norm": 759040.4009588562,
"learning_rate": 4.6785714285714283e-07,
"logits/chosen": -0.5708094835281372,
"logits/rejected": -0.55577552318573,
"logps/chosen": -99.05384826660156,
"logps/rejected": -96.9248046875,
"loss": 127056.3875,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.011573193594813347,
"rewards/margins": -0.0007376443827524781,
"rewards/rejected": -0.010835548862814903,
"step": 50
},
{
"epoch": 0.192,
"grad_norm": 818448.4874125579,
"learning_rate": 4.5e-07,
"logits/chosen": -0.5234788060188293,
"logits/rejected": -0.5684272646903992,
"logps/chosen": -84.0132064819336,
"logps/rejected": -89.70082092285156,
"loss": 124101.0125,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.011890527792274952,
"rewards/margins": 0.0017182690789923072,
"rewards/rejected": -0.013608796522021294,
"step": 60
},
{
"epoch": 0.224,
"grad_norm": 764315.259548912,
"learning_rate": 4.3214285714285713e-07,
"logits/chosen": -0.672571063041687,
"logits/rejected": -0.6554594039916992,
"logps/chosen": -102.6801986694336,
"logps/rejected": -114.0815658569336,
"loss": 125767.8,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.013903990387916565,
"rewards/margins": 0.0018995633581653237,
"rewards/rejected": -0.01580355316400528,
"step": 70
},
{
"epoch": 0.256,
"grad_norm": 792832.7721251897,
"learning_rate": 4.142857142857143e-07,
"logits/chosen": -0.6233155131340027,
"logits/rejected": -0.6050644516944885,
"logps/chosen": -89.83741760253906,
"logps/rejected": -96.45980072021484,
"loss": 126646.1,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.011440077796578407,
"rewards/margins": -0.0004714619426522404,
"rewards/rejected": -0.010968615300953388,
"step": 80
},
{
"epoch": 0.288,
"grad_norm": 810791.4710150602,
"learning_rate": 3.9642857142857137e-07,
"logits/chosen": -0.5288355946540833,
"logits/rejected": -0.507430911064148,
"logps/chosen": -77.9104232788086,
"logps/rejected": -74.20404052734375,
"loss": 126600.7625,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.005952201783657074,
"rewards/margins": -0.001560600707307458,
"rewards/rejected": -0.004391600843518972,
"step": 90
},
{
"epoch": 0.32,
"grad_norm": 612814.6572972395,
"learning_rate": 3.785714285714285e-07,
"logits/chosen": -0.6446259617805481,
"logits/rejected": -0.6776315569877625,
"logps/chosen": -92.22976684570312,
"logps/rejected": -100.54733276367188,
"loss": 124326.1,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.011474112048745155,
"rewards/margins": 0.002196565503254533,
"rewards/rejected": -0.013670678250491619,
"step": 100
},
{
"epoch": 0.352,
"grad_norm": 769940.7880329042,
"learning_rate": 3.607142857142857e-07,
"logits/chosen": -0.5441879630088806,
"logits/rejected": -0.5395065546035767,
"logps/chosen": -64.47439575195312,
"logps/rejected": -78.48651123046875,
"loss": 127264.1375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.008155420422554016,
"rewards/margins": 0.005157289560884237,
"rewards/rejected": -0.013312709517776966,
"step": 110
},
{
"epoch": 0.384,
"grad_norm": 781127.2959197527,
"learning_rate": 3.4285714285714286e-07,
"logits/chosen": -0.7074313759803772,
"logits/rejected": -0.6893147230148315,
"logps/chosen": -99.30326843261719,
"logps/rejected": -100.26654815673828,
"loss": 126373.0,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.006027103401720524,
"rewards/margins": -0.0006245746044442058,
"rewards/rejected": -0.005402528680860996,
"step": 120
},
{
"epoch": 0.416,
"grad_norm": 942915.0070681617,
"learning_rate": 3.25e-07,
"logits/chosen": -0.5311844348907471,
"logits/rejected": -0.5678432583808899,
"logps/chosen": -89.84095001220703,
"logps/rejected": -95.73307800292969,
"loss": 126546.9625,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.005261361598968506,
"rewards/margins": -0.00025905706570483744,
"rewards/rejected": -0.00500230398029089,
"step": 130
},
{
"epoch": 0.448,
"grad_norm": 802161.2678528542,
"learning_rate": 3.0714285714285716e-07,
"logits/chosen": -0.6184743642807007,
"logits/rejected": -0.6451131701469421,
"logps/chosen": -109.21659088134766,
"logps/rejected": -114.1061019897461,
"loss": 125730.125,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.011759540066123009,
"rewards/margins": 0.0014495229115709662,
"rewards/rejected": -0.01320906262844801,
"step": 140
},
{
"epoch": 0.48,
"grad_norm": 866428.7327389624,
"learning_rate": 2.892857142857143e-07,
"logits/chosen": -0.6030551195144653,
"logits/rejected": -0.5557407140731812,
"logps/chosen": -82.86506652832031,
"logps/rejected": -85.31071472167969,
"loss": 125425.025,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.010830635204911232,
"rewards/margins": -6.357554957503453e-05,
"rewards/rejected": -0.010767060332000256,
"step": 150
},
{
"epoch": 0.512,
"grad_norm": 743330.5276750317,
"learning_rate": 2.714285714285714e-07,
"logits/chosen": -0.5015612840652466,
"logits/rejected": -0.5147450566291809,
"logps/chosen": -82.76224517822266,
"logps/rejected": -91.91256713867188,
"loss": 124215.3,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.010083668865263462,
"rewards/margins": 0.0024900883436203003,
"rewards/rejected": -0.012573758140206337,
"step": 160
},
{
"epoch": 0.544,
"grad_norm": 863614.5495224567,
"learning_rate": 2.5357142857142855e-07,
"logits/chosen": -0.5797610878944397,
"logits/rejected": -0.5199266672134399,
"logps/chosen": -94.99356842041016,
"logps/rejected": -96.22293090820312,
"loss": 127004.7,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.012711484916508198,
"rewards/margins": 0.004797719419002533,
"rewards/rejected": -0.017509203404188156,
"step": 170
},
{
"epoch": 0.576,
"grad_norm": 831681.0077569862,
"learning_rate": 2.357142857142857e-07,
"logits/chosen": -0.6032494902610779,
"logits/rejected": -0.579995334148407,
"logps/chosen": -104.5300521850586,
"logps/rejected": -108.78277587890625,
"loss": 125979.4375,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.009229556657373905,
"rewards/margins": 0.004828121047466993,
"rewards/rejected": -0.014057678170502186,
"step": 180
},
{
"epoch": 0.608,
"grad_norm": 780274.1467706825,
"learning_rate": 2.1785714285714284e-07,
"logits/chosen": -0.7121313810348511,
"logits/rejected": -0.667202353477478,
"logps/chosen": -115.69401550292969,
"logps/rejected": -110.82621765136719,
"loss": 124809.7,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.012815780937671661,
"rewards/margins": -0.0001598205417394638,
"rewards/rejected": -0.012655961327254772,
"step": 190
},
{
"epoch": 0.64,
"grad_norm": 774598.0171325745,
"learning_rate": 2e-07,
"logits/chosen": -0.612346351146698,
"logits/rejected": -0.6116153001785278,
"logps/chosen": -91.24519348144531,
"logps/rejected": -97.00153350830078,
"loss": 123650.5375,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.01422748900949955,
"rewards/margins": 0.0018453721422702074,
"rewards/rejected": -0.01607285998761654,
"step": 200
},
{
"epoch": 0.672,
"grad_norm": 1137683.0365726806,
"learning_rate": 1.8214285714285714e-07,
"logits/chosen": -0.6241598725318909,
"logits/rejected": -0.6161590814590454,
"logps/chosen": -82.91732788085938,
"logps/rejected": -92.75973510742188,
"loss": 125116.0125,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.01631699874997139,
"rewards/margins": 0.002573491772636771,
"rewards/rejected": -0.018890492618083954,
"step": 210
},
{
"epoch": 0.704,
"grad_norm": 921161.3498685773,
"learning_rate": 1.6428571428571429e-07,
"logits/chosen": -0.6814984083175659,
"logits/rejected": -0.6642488241195679,
"logps/chosen": -134.07284545898438,
"logps/rejected": -134.7923126220703,
"loss": 125720.675,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.01326170563697815,
"rewards/margins": 0.0025010218378156424,
"rewards/rejected": -0.015762727707624435,
"step": 220
},
{
"epoch": 0.736,
"grad_norm": 813896.4945325998,
"learning_rate": 1.4642857142857143e-07,
"logits/chosen": -0.5411783456802368,
"logits/rejected": -0.5778718590736389,
"logps/chosen": -104.65946197509766,
"logps/rejected": -107.73319244384766,
"loss": 125973.8125,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.012594607658684254,
"rewards/margins": 0.0011098148534074426,
"rewards/rejected": -0.013704421930015087,
"step": 230
},
{
"epoch": 0.768,
"grad_norm": 1031122.2282012746,
"learning_rate": 1.2857142857142855e-07,
"logits/chosen": -0.6678429841995239,
"logits/rejected": -0.6291283369064331,
"logps/chosen": -104.91682434082031,
"logps/rejected": -111.02679443359375,
"loss": 126001.475,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.008200669661164284,
"rewards/margins": 0.0015530238160863519,
"rewards/rejected": -0.009753693826496601,
"step": 240
},
{
"epoch": 0.8,
"grad_norm": 858633.8039080129,
"learning_rate": 1.107142857142857e-07,
"logits/chosen": -0.6295119524002075,
"logits/rejected": -0.6167672872543335,
"logps/chosen": -123.36985778808594,
"logps/rejected": -133.19418334960938,
"loss": 126223.65,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.011947548016905785,
"rewards/margins": 0.006852240767329931,
"rewards/rejected": -0.018799791112542152,
"step": 250
},
{
"epoch": 0.832,
"grad_norm": 951847.1640935472,
"learning_rate": 9.285714285714286e-08,
"logits/chosen": -0.6834455728530884,
"logits/rejected": -0.7226243615150452,
"logps/chosen": -86.39234924316406,
"logps/rejected": -95.36772155761719,
"loss": 124640.2,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.01843985728919506,
"rewards/margins": 0.003491448936983943,
"rewards/rejected": -0.021931307390332222,
"step": 260
},
{
"epoch": 0.864,
"grad_norm": 816825.5268517752,
"learning_rate": 7.5e-08,
"logits/chosen": -0.6084921956062317,
"logits/rejected": -0.606655478477478,
"logps/chosen": -95.06122589111328,
"logps/rejected": -100.9395523071289,
"loss": 126797.975,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.013821562752127647,
"rewards/margins": 0.0025993138551712036,
"rewards/rejected": -0.01642087660729885,
"step": 270
},
{
"epoch": 0.896,
"grad_norm": 823903.2164322428,
"learning_rate": 5.714285714285714e-08,
"logits/chosen": -0.7316595315933228,
"logits/rejected": -0.7817249298095703,
"logps/chosen": -97.38008880615234,
"logps/rejected": -122.05289459228516,
"loss": 122803.6375,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.012123498134315014,
"rewards/margins": 0.006116434000432491,
"rewards/rejected": -0.018239933997392654,
"step": 280
},
{
"epoch": 0.928,
"grad_norm": 1213103.129361221,
"learning_rate": 3.9285714285714285e-08,
"logits/chosen": -0.7132126092910767,
"logits/rejected": -0.7211403846740723,
"logps/chosen": -115.4140853881836,
"logps/rejected": -124.9251480102539,
"loss": 125220.8875,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.008542357943952084,
"rewards/margins": 0.007235427852720022,
"rewards/rejected": -0.01577778533101082,
"step": 290
},
{
"epoch": 0.96,
"grad_norm": 826125.8509083999,
"learning_rate": 2.142857142857143e-08,
"logits/chosen": -0.4794866144657135,
"logits/rejected": -0.48627161979675293,
"logps/chosen": -106.44710540771484,
"logps/rejected": -113.4127197265625,
"loss": 124190.425,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.016785580664873123,
"rewards/margins": 0.002466305159032345,
"rewards/rejected": -0.019251886755228043,
"step": 300
},
{
"epoch": 0.992,
"grad_norm": 853168.6471782625,
"learning_rate": 3.571428571428571e-09,
"logits/chosen": -0.6391203999519348,
"logits/rejected": -0.6226745843887329,
"logps/chosen": -105.24736022949219,
"logps/rejected": -109.426025390625,
"loss": 122976.65,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.01043027639389038,
"rewards/margins": 0.003538835793733597,
"rewards/rejected": -0.013969110324978828,
"step": 310
},
{
"epoch": 0.9984,
"step": 312,
"total_flos": 0.0,
"train_loss": 125356.69771634616,
"train_runtime": 2759.785,
"train_samples_per_second": 7.245,
"train_steps_per_second": 0.113
}
],
"logging_steps": 10,
"max_steps": 312,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}