approx_nash_maxmin_iter_2 / trainer_state.json
YYYYYYibo's picture
Model save
abd58f2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9945,
"eval_steps": 500,
"global_step": 153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 12.825809580615244,
"learning_rate": 3.125e-08,
"logits/chosen": -2.23366379737854,
"logits/rejected": -2.0032992362976074,
"logps/chosen": -196.23782348632812,
"logps/rejected": -174.6262969970703,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.07,
"grad_norm": 17.370258695749197,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.343287944793701,
"logits/rejected": -2.2950587272644043,
"logps/chosen": -179.1259002685547,
"logps/rejected": -178.35891723632812,
"loss": 0.6927,
"rewards/accuracies": 0.4145299196243286,
"rewards/chosen": -0.001734515419229865,
"rewards/margins": 0.001407344127073884,
"rewards/rejected": -0.003141859546303749,
"step": 10
},
{
"epoch": 0.13,
"grad_norm": 15.379940853087401,
"learning_rate": 4.989490450759331e-07,
"logits/chosen": -2.3157341480255127,
"logits/rejected": -2.2556896209716797,
"logps/chosen": -182.99658203125,
"logps/rejected": -181.58053588867188,
"loss": 0.691,
"rewards/accuracies": 0.4961538314819336,
"rewards/chosen": -0.12090444564819336,
"rewards/margins": 0.01673085428774357,
"rewards/rejected": -0.13763530552387238,
"step": 20
},
{
"epoch": 0.2,
"grad_norm": 16.658033176727283,
"learning_rate": 4.872270441827174e-07,
"logits/chosen": -2.240647077560425,
"logits/rejected": -2.139284133911133,
"logps/chosen": -191.006591796875,
"logps/rejected": -186.6674041748047,
"loss": 0.6929,
"rewards/accuracies": 0.4961538314819336,
"rewards/chosen": -0.23701129853725433,
"rewards/margins": -5.6074215535772964e-05,
"rewards/rejected": -0.23695524036884308,
"step": 30
},
{
"epoch": 0.26,
"grad_norm": 13.482187340190304,
"learning_rate": 4.6308512113530063e-07,
"logits/chosen": -2.2772746086120605,
"logits/rejected": -2.310314893722534,
"logps/chosen": -174.7858428955078,
"logps/rejected": -184.82554626464844,
"loss": 0.6912,
"rewards/accuracies": 0.557692289352417,
"rewards/chosen": -0.03568955510854721,
"rewards/margins": 0.0036310250870883465,
"rewards/rejected": -0.03932058438658714,
"step": 40
},
{
"epoch": 0.33,
"grad_norm": 17.865521715649642,
"learning_rate": 4.277872161641681e-07,
"logits/chosen": -2.2564620971679688,
"logits/rejected": -2.278386116027832,
"logps/chosen": -192.4917755126953,
"logps/rejected": -193.97947692871094,
"loss": 0.6886,
"rewards/accuracies": 0.48076921701431274,
"rewards/chosen": -0.042664218693971634,
"rewards/margins": 0.010273917578160763,
"rewards/rejected": -0.05293813720345497,
"step": 50
},
{
"epoch": 0.39,
"grad_norm": 22.19136114113986,
"learning_rate": 3.8318133624280046e-07,
"logits/chosen": -2.145270824432373,
"logits/rejected": -2.073192596435547,
"logps/chosen": -224.1717529296875,
"logps/rejected": -225.2917022705078,
"loss": 0.6932,
"rewards/accuracies": 0.5384615659713745,
"rewards/chosen": -0.3118091821670532,
"rewards/margins": 0.0030472425278276205,
"rewards/rejected": -0.3148564398288727,
"step": 60
},
{
"epoch": 0.46,
"grad_norm": 15.131244681651243,
"learning_rate": 3.316028034595861e-07,
"logits/chosen": -1.976121187210083,
"logits/rejected": -1.939582347869873,
"logps/chosen": -199.67727661132812,
"logps/rejected": -203.1947021484375,
"loss": 0.6892,
"rewards/accuracies": 0.5807692408561707,
"rewards/chosen": -0.179255411028862,
"rewards/margins": 0.009526830166578293,
"rewards/rejected": -0.1887822449207306,
"step": 70
},
{
"epoch": 0.52,
"grad_norm": 16.3790838983982,
"learning_rate": 2.7575199021178855e-07,
"logits/chosen": -2.0032222270965576,
"logits/rejected": -1.8593822717666626,
"logps/chosen": -202.15196228027344,
"logps/rejected": -215.4062042236328,
"loss": 0.6848,
"rewards/accuracies": 0.5846154093742371,
"rewards/chosen": -0.20821429789066315,
"rewards/margins": 0.04003766551613808,
"rewards/rejected": -0.24825195968151093,
"step": 80
},
{
"epoch": 0.58,
"grad_norm": 20.64519176920963,
"learning_rate": 2.1855294234408068e-07,
"logits/chosen": -1.5563592910766602,
"logits/rejected": -1.8040456771850586,
"logps/chosen": -219.68856811523438,
"logps/rejected": -231.39486694335938,
"loss": 0.6812,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.41656777262687683,
"rewards/margins": 0.060058608651161194,
"rewards/rejected": -0.47662636637687683,
"step": 90
},
{
"epoch": 0.65,
"grad_norm": 20.02675570793956,
"learning_rate": 1.6300029195778453e-07,
"logits/chosen": -1.4402265548706055,
"logits/rejected": -1.5615330934524536,
"logps/chosen": -229.6835174560547,
"logps/rejected": -234.88693237304688,
"loss": 0.6791,
"rewards/accuracies": 0.5076923370361328,
"rewards/chosen": -0.511164665222168,
"rewards/margins": 0.028713112697005272,
"rewards/rejected": -0.5398777723312378,
"step": 100
},
{
"epoch": 0.71,
"grad_norm": 17.88209150198185,
"learning_rate": 1.1200247470632392e-07,
"logits/chosen": -1.8932424783706665,
"logits/rejected": -1.7129985094070435,
"logps/chosen": -198.45960998535156,
"logps/rejected": -198.94027709960938,
"loss": 0.6797,
"rewards/accuracies": 0.6230769157409668,
"rewards/chosen": -0.1736122965812683,
"rewards/margins": 0.07991237938404083,
"rewards/rejected": -0.25352466106414795,
"step": 110
},
{
"epoch": 0.78,
"grad_norm": 18.1137509515123,
"learning_rate": 6.822945986946385e-08,
"logits/chosen": -1.6426665782928467,
"logits/rejected": -1.8510866165161133,
"logps/chosen": -188.033203125,
"logps/rejected": -202.1238250732422,
"loss": 0.6814,
"rewards/accuracies": 0.5769230723381042,
"rewards/chosen": -0.1463213562965393,
"rewards/margins": 0.04249217361211777,
"rewards/rejected": -0.18881353735923767,
"step": 120
},
{
"epoch": 0.84,
"grad_norm": 19.900932973122792,
"learning_rate": 3.397296523427806e-08,
"logits/chosen": -1.7662078142166138,
"logits/rejected": -1.7134820222854614,
"logps/chosen": -197.23963928222656,
"logps/rejected": -202.8521270751953,
"loss": 0.6918,
"rewards/accuracies": 0.5653846263885498,
"rewards/chosen": -0.2858903706073761,
"rewards/margins": 0.0338900126516819,
"rewards/rejected": -0.3197803497314453,
"step": 130
},
{
"epoch": 0.91,
"grad_norm": 19.787769013168933,
"learning_rate": 1.1026475173977978e-08,
"logits/chosen": -1.6619917154312134,
"logits/rejected": -1.7303296327590942,
"logps/chosen": -194.7332305908203,
"logps/rejected": -205.8466796875,
"loss": 0.68,
"rewards/accuracies": 0.6230769157409668,
"rewards/chosen": -0.21614637970924377,
"rewards/margins": 0.07540787756443024,
"rewards/rejected": -0.2915542721748352,
"step": 140
},
{
"epoch": 0.97,
"grad_norm": 19.04982739845647,
"learning_rate": 5.913435276374834e-10,
"logits/chosen": -1.928228497505188,
"logits/rejected": -1.5072005987167358,
"logps/chosen": -211.20840454101562,
"logps/rejected": -211.61215209960938,
"loss": 0.6772,
"rewards/accuracies": 0.5884615182876587,
"rewards/chosen": -0.28447794914245605,
"rewards/margins": 0.0762421116232872,
"rewards/rejected": -0.36072006821632385,
"step": 150
},
{
"epoch": 0.99,
"step": 153,
"total_flos": 0.0,
"train_loss": 0.6859961845516379,
"train_runtime": 39900.7608,
"train_samples_per_second": 0.501,
"train_steps_per_second": 0.004
}
],
"logging_steps": 10,
"max_steps": 153,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}