{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.998828811243412,
"eval_steps": 75,
"global_step": 160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024985360140542652,
"grad_norm": 16.803013672547674,
"learning_rate": 4e-09,
"logits/chosen": -0.7169057726860046,
"logits/rejected": -0.7742066979408264,
"logps/chosen": -158.30039978027344,
"logps/rejected": -167.5013427734375,
"loss": 0.6922,
"rewards/accuracies": 0.5390625,
"rewards/chosen": 0.0021153492853045464,
"rewards/margins": 0.0017622699961066246,
"rewards/rejected": 0.00035307969665154815,
"step": 2
},
{
"epoch": 0.049970720281085304,
"grad_norm": 17.42934158508575,
"learning_rate": 8e-09,
"logits/chosen": -0.6620150804519653,
"logits/rejected": -0.7335376143455505,
"logps/chosen": -166.97694396972656,
"logps/rejected": -166.01077270507812,
"loss": 0.6934,
"rewards/accuracies": 0.54296875,
"rewards/chosen": 0.001203760621137917,
"rewards/margins": 0.0035094027407467365,
"rewards/rejected": -0.002305642468854785,
"step": 4
},
{
"epoch": 0.07495608042162795,
"grad_norm": 17.33795202273814,
"learning_rate": 1.1999999999999998e-08,
"logits/chosen": -0.7035447359085083,
"logits/rejected": -0.7770529985427856,
"logps/chosen": -160.94981384277344,
"logps/rejected": -169.4982147216797,
"loss": 0.6942,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.0030440501868724823,
"rewards/margins": -0.003373978193849325,
"rewards/rejected": 0.006418028846383095,
"step": 6
},
{
"epoch": 0.09994144056217061,
"grad_norm": 16.8202461847326,
"learning_rate": 1.6e-08,
"logits/chosen": -0.6711893677711487,
"logits/rejected": -0.7459686994552612,
"logps/chosen": -164.15184020996094,
"logps/rejected": -180.4791259765625,
"loss": 0.6923,
"rewards/accuracies": 0.49609375,
"rewards/chosen": 0.0023907795548439026,
"rewards/margins": -0.002676962874829769,
"rewards/rejected": 0.005067741964012384,
"step": 8
},
{
"epoch": 0.12492680070271325,
"grad_norm": 17.3204666289138,
"learning_rate": 2e-08,
"logits/chosen": -0.6638763546943665,
"logits/rejected": -0.7240799069404602,
"logps/chosen": -165.18699645996094,
"logps/rejected": -153.55906677246094,
"loss": 0.6933,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.0023628827184438705,
"rewards/margins": 0.002067561261355877,
"rewards/rejected": 0.00029532192274928093,
"step": 10
},
{
"epoch": 0.1499121608432559,
"grad_norm": 18.418398156120897,
"learning_rate": 2.3999999999999997e-08,
"logits/chosen": -0.7016454935073853,
"logits/rejected": -0.7838542461395264,
"logps/chosen": -159.6986083984375,
"logps/rejected": -275.36236572265625,
"loss": 0.694,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -0.0019385786727070808,
"rewards/margins": 0.0008969469927251339,
"rewards/rejected": -0.0028355256654322147,
"step": 12
},
{
"epoch": 0.17489752098379854,
"grad_norm": 17.791533019243598,
"learning_rate": 2.8000000000000003e-08,
"logits/chosen": -0.6400444507598877,
"logits/rejected": -0.7088255882263184,
"logps/chosen": -159.34640502929688,
"logps/rejected": -162.47824096679688,
"loss": 0.6923,
"rewards/accuracies": 0.51953125,
"rewards/chosen": 0.0004528433782979846,
"rewards/margins": 0.0028284057043492794,
"rewards/rejected": -0.0023755626752972603,
"step": 14
},
{
"epoch": 0.19988288112434122,
"grad_norm": 17.319201876452443,
"learning_rate": 3.2e-08,
"logits/chosen": -0.6704908609390259,
"logits/rejected": -0.7314557433128357,
"logps/chosen": -160.60862731933594,
"logps/rejected": -166.46450805664062,
"loss": 0.6936,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.001064170734025538,
"rewards/margins": 3.689667209982872e-06,
"rewards/rejected": 0.0010604818817228079,
"step": 16
},
{
"epoch": 0.22486824126488386,
"grad_norm": 16.072247731283092,
"learning_rate": 3.6e-08,
"logits/chosen": -0.6766926050186157,
"logits/rejected": -0.7459310293197632,
"logps/chosen": -162.13914489746094,
"logps/rejected": -191.6351318359375,
"loss": 0.6931,
"rewards/accuracies": 0.50390625,
"rewards/chosen": 0.005069206468760967,
"rewards/margins": 0.0026566418819129467,
"rewards/rejected": 0.0024125645868480206,
"step": 18
},
{
"epoch": 0.2498536014054265,
"grad_norm": 17.359137433037688,
"learning_rate": 4e-08,
"logits/chosen": -0.6625803709030151,
"logits/rejected": -0.7203136682510376,
"logps/chosen": -156.5331573486328,
"logps/rejected": -222.60467529296875,
"loss": 0.6923,
"rewards/accuracies": 0.47265625,
"rewards/chosen": 0.001491243951022625,
"rewards/margins": -0.0024129198864102364,
"rewards/rejected": 0.0039041636046022177,
"step": 20
},
{
"epoch": 0.27483896154596915,
"grad_norm": 17.83000320149723,
"learning_rate": 4.4e-08,
"logits/chosen": -0.6529428958892822,
"logits/rejected": -0.7184248566627502,
"logps/chosen": -161.4114990234375,
"logps/rejected": -171.13998413085938,
"loss": 0.6933,
"rewards/accuracies": 0.54296875,
"rewards/chosen": 0.003907513804733753,
"rewards/margins": 0.0027447110041975975,
"rewards/rejected": 0.0011628026841208339,
"step": 22
},
{
"epoch": 0.2998243216865118,
"grad_norm": 18.22059882059828,
"learning_rate": 4.799999999999999e-08,
"logits/chosen": -0.6906304955482483,
"logits/rejected": -0.7680624723434448,
"logps/chosen": -171.14309692382812,
"logps/rejected": -251.1785888671875,
"loss": 0.6916,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.004641087260097265,
"rewards/margins": 0.0015279713552445173,
"rewards/rejected": 0.0031131161376833916,
"step": 24
},
{
"epoch": 0.32480968182705444,
"grad_norm": 16.628845467432836,
"learning_rate": 5.2e-08,
"logits/chosen": -0.6719599962234497,
"logits/rejected": -0.743903636932373,
"logps/chosen": -159.7659912109375,
"logps/rejected": -191.9639129638672,
"loss": 0.6898,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.004612984135746956,
"rewards/margins": 0.00576308136805892,
"rewards/rejected": -0.001150098629295826,
"step": 26
},
{
"epoch": 0.3497950419675971,
"grad_norm": 17.41091157106264,
"learning_rate": 5.6000000000000005e-08,
"logits/chosen": -0.6741428375244141,
"logits/rejected": -0.7584381699562073,
"logps/chosen": -162.17498779296875,
"logps/rejected": -213.79025268554688,
"loss": 0.6896,
"rewards/accuracies": 0.55859375,
"rewards/chosen": 0.011129561811685562,
"rewards/margins": 0.009353543631732464,
"rewards/rejected": 0.0017760168993845582,
"step": 28
},
{
"epoch": 0.3747804021081398,
"grad_norm": 17.981525024641783,
"learning_rate": 6e-08,
"logits/chosen": -0.6742160320281982,
"logits/rejected": -0.7543560266494751,
"logps/chosen": -164.4986114501953,
"logps/rejected": -202.91433715820312,
"loss": 0.6892,
"rewards/accuracies": 0.52734375,
"rewards/chosen": 0.006884717848151922,
"rewards/margins": 0.0076943556778132915,
"rewards/rejected": -0.000809638062492013,
"step": 30
},
{
"epoch": 0.39976576224868243,
"grad_norm": 17.109113741756868,
"learning_rate": 6.4e-08,
"logits/chosen": -0.6298938989639282,
"logits/rejected": -0.7012688517570496,
"logps/chosen": -160.2120361328125,
"logps/rejected": -160.2590789794922,
"loss": 0.6895,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.010141907259821892,
"rewards/margins": 0.007148087956011295,
"rewards/rejected": 0.00299381953664124,
"step": 32
},
{
"epoch": 0.4247511223892251,
"grad_norm": 17.727097440594363,
"learning_rate": 6.8e-08,
"logits/chosen": -0.6654178500175476,
"logits/rejected": -0.7475502490997314,
"logps/chosen": -174.74790954589844,
"logps/rejected": -165.39630126953125,
"loss": 0.6846,
"rewards/accuracies": 0.63671875,
"rewards/chosen": 0.022659441456198692,
"rewards/margins": 0.020327560603618622,
"rewards/rejected": 0.002331881085410714,
"step": 34
},
{
"epoch": 0.4497364825297677,
"grad_norm": 16.090260229163803,
"learning_rate": 7.2e-08,
"logits/chosen": -0.6315876841545105,
"logits/rejected": -0.7063596844673157,
"logps/chosen": -158.3714141845703,
"logps/rejected": -169.9803924560547,
"loss": 0.6823,
"rewards/accuracies": 0.62109375,
"rewards/chosen": 0.02803650312125683,
"rewards/margins": 0.024142108857631683,
"rewards/rejected": 0.0038943937979638577,
"step": 36
},
{
"epoch": 0.47472184267031037,
"grad_norm": 17.989654826552492,
"learning_rate": 7.599999999999999e-08,
"logits/chosen": -0.6781339645385742,
"logits/rejected": -0.749303936958313,
"logps/chosen": -158.1745147705078,
"logps/rejected": -174.4684295654297,
"loss": 0.6798,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.035830847918987274,
"rewards/margins": 0.030665559694170952,
"rewards/rejected": 0.005165286362171173,
"step": 38
},
{
"epoch": 0.499707202810853,
"grad_norm": 16.659252701258378,
"learning_rate": 8e-08,
"logits/chosen": -0.6752834916114807,
"logits/rejected": -0.7593508958816528,
"logps/chosen": -171.382568359375,
"logps/rejected": -163.63429260253906,
"loss": 0.6797,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.03702434524893761,
"rewards/margins": 0.027536926791071892,
"rewards/rejected": 0.009487415663897991,
"step": 40
},
{
"epoch": 0.5246925629513957,
"grad_norm": 16.32043656758089,
"learning_rate": 8.4e-08,
"logits/chosen": -0.6655137538909912,
"logits/rejected": -0.7505479454994202,
"logps/chosen": -169.40570068359375,
"logps/rejected": -204.10903930664062,
"loss": 0.6792,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.03623414784669876,
"rewards/margins": 0.026394760236144066,
"rewards/rejected": 0.00983938854187727,
"step": 42
},
{
"epoch": 0.5496779230919383,
"grad_norm": 15.622546817842467,
"learning_rate": 8.8e-08,
"logits/chosen": -0.6490427255630493,
"logits/rejected": -0.7285715937614441,
"logps/chosen": -156.1206817626953,
"logps/rejected": -163.50775146484375,
"loss": 0.678,
"rewards/accuracies": 0.68359375,
"rewards/chosen": 0.03998086601495743,
"rewards/margins": 0.03349429368972778,
"rewards/rejected": 0.006486575584858656,
"step": 44
},
{
"epoch": 0.574663283232481,
"grad_norm": 16.184465147770208,
"learning_rate": 9.2e-08,
"logits/chosen": -0.6619192361831665,
"logits/rejected": -0.7221629023551941,
"logps/chosen": -165.5501251220703,
"logps/rejected": -181.84275817871094,
"loss": 0.674,
"rewards/accuracies": 0.72265625,
"rewards/chosen": 0.054740943014621735,
"rewards/margins": 0.04307195544242859,
"rewards/rejected": 0.011668986640870571,
"step": 46
},
{
"epoch": 0.5996486433730236,
"grad_norm": 16.443566147600368,
"learning_rate": 9.599999999999999e-08,
"logits/chosen": -0.6748422384262085,
"logits/rejected": -0.7496626973152161,
"logps/chosen": -164.22450256347656,
"logps/rejected": -171.41998291015625,
"loss": 0.6607,
"rewards/accuracies": 0.75390625,
"rewards/chosen": 0.07237192243337631,
"rewards/margins": 0.06578801572322845,
"rewards/rejected": 0.006583897862583399,
"step": 48
},
{
"epoch": 0.6246340035135662,
"grad_norm": 15.12731079703187,
"learning_rate": 1e-07,
"logits/chosen": -0.6667000651359558,
"logits/rejected": -0.7247492074966431,
"logps/chosen": -152.322998046875,
"logps/rejected": -198.47271728515625,
"loss": 0.6565,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 0.08942899107933044,
"rewards/margins": 0.06775067746639252,
"rewards/rejected": 0.021678313612937927,
"step": 50
},
{
"epoch": 0.6496193636541089,
"grad_norm": 14.509532935330286,
"learning_rate": 1.04e-07,
"logits/chosen": -0.6652993559837341,
"logits/rejected": -0.761294960975647,
"logps/chosen": -161.75856018066406,
"logps/rejected": -202.80789184570312,
"loss": 0.6513,
"rewards/accuracies": 0.80859375,
"rewards/chosen": 0.10889974981546402,
"rewards/margins": 0.10048462450504303,
"rewards/rejected": 0.008415117859840393,
"step": 52
},
{
"epoch": 0.6746047237946515,
"grad_norm": 13.828860573682139,
"learning_rate": 1.08e-07,
"logits/chosen": -0.6644891500473022,
"logits/rejected": -0.7297866940498352,
"logps/chosen": -167.40745544433594,
"logps/rejected": -191.5254669189453,
"loss": 0.649,
"rewards/accuracies": 0.76953125,
"rewards/chosen": 0.10965421050786972,
"rewards/margins": 0.09407318383455276,
"rewards/rejected": 0.015581016428768635,
"step": 54
},
{
"epoch": 0.6995900839351942,
"grad_norm": 14.094010354912754,
"learning_rate": 1.1200000000000001e-07,
"logits/chosen": -0.6829609274864197,
"logits/rejected": -0.7518411874771118,
"logps/chosen": -161.49169921875,
"logps/rejected": -194.95101928710938,
"loss": 0.6448,
"rewards/accuracies": 0.75390625,
"rewards/chosen": 0.10309572517871857,
"rewards/margins": 0.10116783529520035,
"rewards/rejected": 0.0019278817344456911,
"step": 56
},
{
"epoch": 0.7245754440757368,
"grad_norm": 14.33157630919911,
"learning_rate": 1.1599999999999999e-07,
"logits/chosen": -0.678025484085083,
"logits/rejected": -0.7501699924468994,
"logps/chosen": -163.56793212890625,
"logps/rejected": -198.6992950439453,
"loss": 0.6439,
"rewards/accuracies": 0.77734375,
"rewards/chosen": 0.12135004997253418,
"rewards/margins": 0.11319853365421295,
"rewards/rejected": 0.008151513524353504,
"step": 58
},
{
"epoch": 0.7495608042162796,
"grad_norm": 14.250114975850236,
"learning_rate": 1.2e-07,
"logits/chosen": -0.6545270681381226,
"logits/rejected": -0.742324709892273,
"logps/chosen": -177.452880859375,
"logps/rejected": -257.645263671875,
"loss": 0.6364,
"rewards/accuracies": 0.74609375,
"rewards/chosen": 0.11762025952339172,
"rewards/margins": 0.10628640651702881,
"rewards/rejected": 0.011333855800330639,
"step": 60
},
{
"epoch": 0.7745461643568222,
"grad_norm": 13.35663678176419,
"learning_rate": 1.24e-07,
"logits/chosen": -0.6429523229598999,
"logits/rejected": -0.7005941867828369,
"logps/chosen": -160.99609375,
"logps/rejected": -158.8332061767578,
"loss": 0.6357,
"rewards/accuracies": 0.75390625,
"rewards/chosen": 0.11651378124952316,
"rewards/margins": 0.11703144758939743,
"rewards/rejected": -0.0005176601116545498,
"step": 62
},
{
"epoch": 0.7995315244973649,
"grad_norm": 13.344011017214148,
"learning_rate": 1.28e-07,
"logits/chosen": -0.671217143535614,
"logits/rejected": -0.7481105923652649,
"logps/chosen": -164.79576110839844,
"logps/rejected": -214.5727081298828,
"loss": 0.6272,
"rewards/accuracies": 0.79296875,
"rewards/chosen": 0.13732488453388214,
"rewards/margins": 0.14830084145069122,
"rewards/rejected": -0.010975953191518784,
"step": 64
},
{
"epoch": 0.8245168846379075,
"grad_norm": 11.603847422508784,
"learning_rate": 1.32e-07,
"logits/chosen": -0.6748225092887878,
"logits/rejected": -0.7619199752807617,
"logps/chosen": -165.9488525390625,
"logps/rejected": -259.8753967285156,
"loss": 0.615,
"rewards/accuracies": 0.76171875,
"rewards/chosen": 0.13191932439804077,
"rewards/margins": 0.1852141171693802,
"rewards/rejected": -0.05329480394721031,
"step": 66
},
{
"epoch": 0.8495022447784502,
"grad_norm": 11.310140143623643,
"learning_rate": 1.36e-07,
"logits/chosen": -0.6559648513793945,
"logits/rejected": -0.7262955904006958,
"logps/chosen": -162.48326110839844,
"logps/rejected": -168.70834350585938,
"loss": 0.5908,
"rewards/accuracies": 0.80078125,
"rewards/chosen": 0.15893952548503876,
"rewards/margins": 0.24133484065532684,
"rewards/rejected": -0.08239532262086868,
"step": 68
},
{
"epoch": 0.8744876049189928,
"grad_norm": 11.076590712903254,
"learning_rate": 1.3999999999999998e-07,
"logits/chosen": -0.64164799451828,
"logits/rejected": -0.6981642246246338,
"logps/chosen": -159.47991943359375,
"logps/rejected": -164.59423828125,
"loss": 0.588,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.11835081875324249,
"rewards/margins": 0.2690027356147766,
"rewards/rejected": -0.15065191686153412,
"step": 70
},
{
"epoch": 0.8994729650595354,
"grad_norm": 11.30376463380917,
"learning_rate": 1.44e-07,
"logits/chosen": -0.6537081003189087,
"logits/rejected": -0.7259843349456787,
"logps/chosen": -163.25912475585938,
"logps/rejected": -186.23190307617188,
"loss": 0.5899,
"rewards/accuracies": 0.73828125,
"rewards/chosen": 0.036043643951416016,
"rewards/margins": 0.22015729546546936,
"rewards/rejected": -0.18411365151405334,
"step": 72
},
{
"epoch": 0.9244583252000781,
"grad_norm": 11.470685518141812,
"learning_rate": 1.48e-07,
"logits/chosen": -0.6623800992965698,
"logits/rejected": -0.7280963063240051,
"logps/chosen": -163.83189392089844,
"logps/rejected": -162.69908142089844,
"loss": 0.5701,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.0784626230597496,
"rewards/margins": 0.33121681213378906,
"rewards/rejected": -0.25275421142578125,
"step": 74
},
{
"epoch": 0.9369510052703494,
"eval_logits/chosen": -0.6098010540008545,
"eval_logits/rejected": -0.6948941946029663,
"eval_logps/chosen": -174.5200653076172,
"eval_logps/rejected": -156.43321228027344,
"eval_loss": 0.5377179384231567,
"eval_rewards/accuracies": 0.8399999737739563,
"eval_rewards/chosen": 0.11009039729833603,
"eval_rewards/margins": 0.3738202750682831,
"eval_rewards/rejected": -0.26372990012168884,
"eval_runtime": 29.7619,
"eval_samples_per_second": 3.36,
"eval_steps_per_second": 0.84,
"step": 75
},
{
"epoch": 0.9494436853406207,
"grad_norm": 10.843228504877107,
"learning_rate": 1.5199999999999998e-07,
"logits/chosen": -0.6558808088302612,
"logits/rejected": -0.7365143299102783,
"logps/chosen": -167.81484985351562,
"logps/rejected": -233.72686767578125,
"loss": 0.5714,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.06528769433498383,
"rewards/margins": 0.31673550605773926,
"rewards/rejected": -0.2514478266239166,
"step": 76
},
{
"epoch": 0.9744290454811634,
"grad_norm": 11.247238407003374,
"learning_rate": 1.56e-07,
"logits/chosen": -0.6631561517715454,
"logits/rejected": -0.732117772102356,
"logps/chosen": -158.63388061523438,
"logps/rejected": -186.37835693359375,
"loss": 0.5622,
"rewards/accuracies": 0.80078125,
"rewards/chosen": 0.03322272002696991,
"rewards/margins": 0.3244516849517822,
"rewards/rejected": -0.2912289500236511,
"step": 78
},
{
"epoch": 0.999414405621706,
"grad_norm": 10.873879267366776,
"learning_rate": 1.6e-07,
"logits/chosen": -0.678787887096405,
"logits/rejected": -0.762289822101593,
"logps/chosen": -173.52723693847656,
"logps/rejected": -212.18414306640625,
"loss": 0.5644,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.04449426010251045,
"rewards/margins": 0.29196038842201233,
"rewards/rejected": -0.24746613204479218,
"step": 80
},
{
"epoch": 1.0243997657622488,
"grad_norm": 10.813522808672303,
"learning_rate": 1.6399999999999999e-07,
"logits/chosen": -0.6670259833335876,
"logits/rejected": -0.731939971446991,
"logps/chosen": -169.05836486816406,
"logps/rejected": -183.41171264648438,
"loss": 0.5481,
"rewards/accuracies": 0.8203125,
"rewards/chosen": 0.09290473908185959,
"rewards/margins": 0.3946535289287567,
"rewards/rejected": -0.3017488121986389,
"step": 82
},
{
"epoch": 1.0493851259027913,
"grad_norm": 10.85312267043124,
"learning_rate": 1.68e-07,
"logits/chosen": -0.6822367310523987,
"logits/rejected": -0.7420221567153931,
"logps/chosen": -168.18081665039062,
"logps/rejected": -175.647705078125,
"loss": 0.5433,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.017166346311569214,
"rewards/margins": 0.35215744376182556,
"rewards/rejected": -0.33499109745025635,
"step": 84
},
{
"epoch": 1.074370486043334,
"grad_norm": 10.234969231047268,
"learning_rate": 1.7199999999999998e-07,
"logits/chosen": -0.6367188096046448,
"logits/rejected": -0.7021892666816711,
"logps/chosen": -168.204345703125,
"logps/rejected": -172.60887145996094,
"loss": 0.5389,
"rewards/accuracies": 0.78515625,
"rewards/chosen": 0.031016340479254723,
"rewards/margins": 0.4050399959087372,
"rewards/rejected": -0.3740236461162567,
"step": 86
},
{
"epoch": 1.0993558461838766,
"grad_norm": 10.400074349909037,
"learning_rate": 1.76e-07,
"logits/chosen": -0.6521391272544861,
"logits/rejected": -0.7159854769706726,
"logps/chosen": -165.23777770996094,
"logps/rejected": -179.17791748046875,
"loss": 0.5255,
"rewards/accuracies": 0.80859375,
"rewards/chosen": 0.010984277352690697,
"rewards/margins": 0.4423283338546753,
"rewards/rejected": -0.43134409189224243,
"step": 88
},
{
"epoch": 1.1243412063244194,
"grad_norm": 10.94221462203906,
"learning_rate": 1.8e-07,
"logits/chosen": -0.6484578847885132,
"logits/rejected": -0.7167034149169922,
"logps/chosen": -164.0042724609375,
"logps/rejected": -189.57470703125,
"loss": 0.4977,
"rewards/accuracies": 0.80859375,
"rewards/chosen": 0.044118743389844894,
"rewards/margins": 0.5543583035469055,
"rewards/rejected": -0.5102395415306091,
"step": 90
},
{
"epoch": 1.149326566464962,
"grad_norm": 10.061790706384036,
"learning_rate": 1.84e-07,
"logits/chosen": -0.6450331211090088,
"logits/rejected": -0.6989036798477173,
"logps/chosen": -161.26258850097656,
"logps/rejected": -176.26287841796875,
"loss": 0.4822,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -0.09776711463928223,
"rewards/margins": 0.5529555678367615,
"rewards/rejected": -0.6507226824760437,
"step": 92
},
{
"epoch": 1.1743119266055047,
"grad_norm": 10.181525940489752,
"learning_rate": 1.88e-07,
"logits/chosen": -0.6763854026794434,
"logits/rejected": -0.7379953861236572,
"logps/chosen": -162.50949096679688,
"logps/rejected": -195.75962829589844,
"loss": 0.467,
"rewards/accuracies": 0.84765625,
"rewards/chosen": -0.21141284704208374,
"rewards/margins": 0.6165250539779663,
"rewards/rejected": -0.8279378414154053,
"step": 94
},
{
"epoch": 1.1992972867460472,
"grad_norm": 9.59098495871882,
"learning_rate": 1.9199999999999997e-07,
"logits/chosen": -0.6583154201507568,
"logits/rejected": -0.7248339653015137,
"logps/chosen": -171.67164611816406,
"logps/rejected": -204.2442626953125,
"loss": 0.4525,
"rewards/accuracies": 0.83203125,
"rewards/chosen": -0.22590558230876923,
"rewards/margins": 0.7413816452026367,
"rewards/rejected": -0.9672871828079224,
"step": 96
},
{
"epoch": 1.22428264688659,
"grad_norm": 9.611151986852143,
"learning_rate": 1.9599999999999998e-07,
"logits/chosen": -0.6739534139633179,
"logits/rejected": -0.7209540605545044,
"logps/chosen": -165.88185119628906,
"logps/rejected": -198.14913940429688,
"loss": 0.4362,
"rewards/accuracies": 0.83984375,
"rewards/chosen": -0.4009418785572052,
"rewards/margins": 0.8526190519332886,
"rewards/rejected": -1.2535607814788818,
"step": 98
},
{
"epoch": 1.2492680070271325,
"grad_norm": 9.635547492152954,
"learning_rate": 2e-07,
"logits/chosen": -0.6529893279075623,
"logits/rejected": -0.7117218971252441,
"logps/chosen": -167.87130737304688,
"logps/rejected": -199.54925537109375,
"loss": 0.4253,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.5539758205413818,
"rewards/margins": 0.8746498823165894,
"rewards/rejected": -1.4286257028579712,
"step": 100
},
{
"epoch": 1.2742533671676752,
"grad_norm": 9.323014566726187,
"learning_rate": 1.9945218953682733e-07,
"logits/chosen": -0.6634210348129272,
"logits/rejected": -0.7515499591827393,
"logps/chosen": -179.8871307373047,
"logps/rejected": -211.56712341308594,
"loss": 0.4162,
"rewards/accuracies": 0.8828125,
"rewards/chosen": -0.6504544615745544,
"rewards/margins": 0.9477463364601135,
"rewards/rejected": -1.598200798034668,
"step": 102
},
{
"epoch": 1.2992387273082178,
"grad_norm": 9.782518483212584,
"learning_rate": 1.9781476007338056e-07,
"logits/chosen": -0.6890003681182861,
"logits/rejected": -0.7631358504295349,
"logps/chosen": -178.91226196289062,
"logps/rejected": -221.65196228027344,
"loss": 0.4111,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.8466004729270935,
"rewards/margins": 0.9782698154449463,
"rewards/rejected": -1.8248703479766846,
"step": 104
},
{
"epoch": 1.3242240874487605,
"grad_norm": 9.676632000972155,
"learning_rate": 1.9510565162951537e-07,
"logits/chosen": -0.6939510703086853,
"logits/rejected": -0.7790961861610413,
"logps/chosen": -175.7962188720703,
"logps/rejected": -225.33914184570312,
"loss": 0.3898,
"rewards/accuracies": 0.85546875,
"rewards/chosen": -0.8194781541824341,
"rewards/margins": 1.1897538900375366,
"rewards/rejected": -2.0092320442199707,
"step": 106
},
{
"epoch": 1.349209447589303,
"grad_norm": 9.450466029531317,
"learning_rate": 1.9135454576426007e-07,
"logits/chosen": -0.6339809894561768,
"logits/rejected": -0.7058581113815308,
"logps/chosen": -176.75390625,
"logps/rejected": -183.5647430419922,
"loss": 0.3804,
"rewards/accuracies": 0.83984375,
"rewards/chosen": -0.9991697072982788,
"rewards/margins": 1.0066075325012207,
"rewards/rejected": -2.00577712059021,
"step": 108
},
{
"epoch": 1.3741948077298458,
"grad_norm": 9.045883620582659,
"learning_rate": 1.8660254037844388e-07,
"logits/chosen": -0.6670467853546143,
"logits/rejected": -0.7262380123138428,
"logps/chosen": -170.4173126220703,
"logps/rejected": -232.4281005859375,
"loss": 0.3591,
"rewards/accuracies": 0.89453125,
"rewards/chosen": -0.9117798805236816,
"rewards/margins": 1.2910921573638916,
"rewards/rejected": -2.2028720378875732,
"step": 110
},
{
"epoch": 1.3991801678703886,
"grad_norm": 8.752793412837908,
"learning_rate": 1.8090169943749475e-07,
"logits/chosen": -0.66654372215271,
"logits/rejected": -0.738584041595459,
"logps/chosen": -168.66757202148438,
"logps/rejected": -189.63882446289062,
"loss": 0.3534,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -0.9708907604217529,
"rewards/margins": 1.3288507461547852,
"rewards/rejected": -2.299741506576538,
"step": 112
},
{
"epoch": 1.424165528010931,
"grad_norm": 8.754108546199273,
"learning_rate": 1.7431448254773942e-07,
"logits/chosen": -0.6560633778572083,
"logits/rejected": -0.7114984393119812,
"logps/chosen": -163.83883666992188,
"logps/rejected": -181.41429138183594,
"loss": 0.3265,
"rewards/accuracies": 0.86328125,
"rewards/chosen": -1.0229204893112183,
"rewards/margins": 1.4013088941574097,
"rewards/rejected": -2.424229383468628,
"step": 114
},
{
"epoch": 1.4491508881514736,
"grad_norm": 8.842470960325368,
"learning_rate": 1.669130606358858e-07,
"logits/chosen": -0.6747975945472717,
"logits/rejected": -0.7556227445602417,
"logps/chosen": -177.47671508789062,
"logps/rejected": -229.03138732910156,
"loss": 0.3496,
"rewards/accuracies": 0.84765625,
"rewards/chosen": -1.2031465768814087,
"rewards/margins": 1.2502222061157227,
"rewards/rejected": -2.453368663787842,
"step": 116
},
{
"epoch": 1.4741362482920164,
"grad_norm": 8.560944140870841,
"learning_rate": 1.5877852522924732e-07,
"logits/chosen": -0.6661523580551147,
"logits/rejected": -0.7374821305274963,
"logps/chosen": -179.45278930664062,
"logps/rejected": -200.83184814453125,
"loss": 0.3463,
"rewards/accuracies": 0.83984375,
"rewards/chosen": -1.274424433708191,
"rewards/margins": 1.3644013404846191,
"rewards/rejected": -2.6388256549835205,
"step": 118
},
{
"epoch": 1.4991216084325591,
"grad_norm": 8.135071032264696,
"learning_rate": 1.5e-07,
"logits/chosen": -0.6814154982566833,
"logits/rejected": -0.7541234493255615,
"logps/chosen": -176.5241241455078,
"logps/rejected": -178.12158203125,
"loss": 0.3172,
"rewards/accuracies": 0.8671875,
"rewards/chosen": -1.236649513244629,
"rewards/margins": 1.4433623552322388,
"rewards/rejected": -2.680011749267578,
"step": 120
},
{
"epoch": 1.5241069685731017,
"grad_norm": 8.611381059926462,
"learning_rate": 1.4067366430758004e-07,
"logits/chosen": -0.7052810192108154,
"logits/rejected": -0.7841841578483582,
"logps/chosen": -176.52615356445312,
"logps/rejected": -265.2866516113281,
"loss": 0.3464,
"rewards/accuracies": 0.88671875,
"rewards/chosen": -1.2547199726104736,
"rewards/margins": 1.5170029401779175,
"rewards/rejected": -2.7717230319976807,
"step": 122
},
{
"epoch": 1.5490923287136442,
"grad_norm": 8.252873140772632,
"learning_rate": 1.3090169943749475e-07,
"logits/chosen": -0.6520602107048035,
"logits/rejected": -0.7327940464019775,
"logps/chosen": -176.3183135986328,
"logps/rejected": -229.5253448486328,
"loss": 0.3087,
"rewards/accuracies": 0.88671875,
"rewards/chosen": -1.2032685279846191,
"rewards/margins": 1.7557697296142578,
"rewards/rejected": -2.959038257598877,
"step": 124
},
{
"epoch": 1.574077688854187,
"grad_norm": 10.055814181219855,
"learning_rate": 1.207911690817759e-07,
"logits/chosen": -0.6678023338317871,
"logits/rejected": -0.7291412949562073,
"logps/chosen": -170.61351013183594,
"logps/rejected": -191.97714233398438,
"loss": 0.3451,
"rewards/accuracies": 0.88671875,
"rewards/chosen": -1.2367451190948486,
"rewards/margins": 1.7602653503417969,
"rewards/rejected": -2.9970104694366455,
"step": 126
},
{
"epoch": 1.5990630489947297,
"grad_norm": 7.755494267888693,
"learning_rate": 1.1045284632676535e-07,
"logits/chosen": -0.6732159852981567,
"logits/rejected": -0.7458621263504028,
"logps/chosen": -179.55528259277344,
"logps/rejected": -194.04571533203125,
"loss": 0.3013,
"rewards/accuracies": 0.9140625,
"rewards/chosen": -1.2336257696151733,
"rewards/margins": 1.7932240962982178,
"rewards/rejected": -3.0268499851226807,
"step": 128
},
{
"epoch": 1.6240484091352723,
"grad_norm": 8.333802703761862,
"learning_rate": 1e-07,
"logits/chosen": -0.6721549034118652,
"logits/rejected": -0.7458239793777466,
"logps/chosen": -182.87872314453125,
"logps/rejected": -235.57015991210938,
"loss": 0.3049,
"rewards/accuracies": 0.89453125,
"rewards/chosen": -1.2039211988449097,
"rewards/margins": 1.8136268854141235,
"rewards/rejected": -3.017548084259033,
"step": 130
},
{
"epoch": 1.6490337692758148,
"grad_norm": 8.273848396450413,
"learning_rate": 8.954715367323466e-08,
"logits/chosen": -0.693534255027771,
"logits/rejected": -0.7786884307861328,
"logps/chosen": -180.05812072753906,
"logps/rejected": -233.4104461669922,
"loss": 0.2977,
"rewards/accuracies": 0.92578125,
"rewards/chosen": -1.2325382232666016,
"rewards/margins": 1.9344900846481323,
"rewards/rejected": -3.1670281887054443,
"step": 132
},
{
"epoch": 1.6740191294163576,
"grad_norm": 7.810872252646391,
"learning_rate": 7.920883091822408e-08,
"logits/chosen": -0.6568552255630493,
"logits/rejected": -0.7265664935112,
"logps/chosen": -176.93911743164062,
"logps/rejected": -214.8884735107422,
"loss": 0.2827,
"rewards/accuracies": 0.8984375,
"rewards/chosen": -1.2643663883209229,
"rewards/margins": 1.8012300729751587,
"rewards/rejected": -3.065596580505371,
"step": 134
},
{
"epoch": 1.6990044895569003,
"grad_norm": 7.802511130852569,
"learning_rate": 6.909830056250527e-08,
"logits/chosen": -0.691701352596283,
"logits/rejected": -0.7584172487258911,
"logps/chosen": -180.76043701171875,
"logps/rejected": -201.7536163330078,
"loss": 0.2923,
"rewards/accuracies": 0.8828125,
"rewards/chosen": -1.3751585483551025,
"rewards/margins": 1.8911828994750977,
"rewards/rejected": -3.2663414478302,
"step": 136
},
{
"epoch": 1.723989849697443,
"grad_norm": 7.661437671496506,
"learning_rate": 5.9326335692419996e-08,
"logits/chosen": -0.691138744354248,
"logits/rejected": -0.7739748954772949,
"logps/chosen": -179.13803100585938,
"logps/rejected": -230.07017517089844,
"loss": 0.2703,
"rewards/accuracies": 0.9140625,
"rewards/chosen": -1.3471490144729614,
"rewards/margins": 2.1134843826293945,
"rewards/rejected": -3.4606332778930664,
"step": 138
},
{
"epoch": 1.7489752098379856,
"grad_norm": 7.425320729921027,
"learning_rate": 5.000000000000002e-08,
"logits/chosen": -0.6808772087097168,
"logits/rejected": -0.7619104385375977,
"logps/chosen": -178.50820922851562,
"logps/rejected": -221.21847534179688,
"loss": 0.2903,
"rewards/accuracies": 0.8828125,
"rewards/chosen": -1.328997015953064,
"rewards/margins": 1.9231306314468384,
"rewards/rejected": -3.2521276473999023,
"step": 140
},
{
"epoch": 1.7739605699785281,
"grad_norm": 7.517073157187585,
"learning_rate": 4.1221474770752695e-08,
"logits/chosen": -0.6579867005348206,
"logits/rejected": -0.7242329716682434,
"logps/chosen": -174.0385284423828,
"logps/rejected": -226.00914001464844,
"loss": 0.2703,
"rewards/accuracies": 0.9140625,
"rewards/chosen": -1.3584879636764526,
"rewards/margins": 1.9965893030166626,
"rewards/rejected": -3.3550772666931152,
"step": 142
},
{
"epoch": 1.798945930119071,
"grad_norm": 7.848642238408611,
"learning_rate": 3.3086939364114206e-08,
"logits/chosen": -0.6827540397644043,
"logits/rejected": -0.7463814616203308,
"logps/chosen": -181.77685546875,
"logps/rejected": -202.3132781982422,
"loss": 0.2875,
"rewards/accuracies": 0.9140625,
"rewards/chosen": -1.3786863088607788,
"rewards/margins": 1.8975354433059692,
"rewards/rejected": -3.276221990585327,
"step": 144
},
{
"epoch": 1.8239312902596136,
"grad_norm": 7.224249615030409,
"learning_rate": 2.5685517452260564e-08,
"logits/chosen": -0.6343103647232056,
"logits/rejected": -0.7141076326370239,
"logps/chosen": -190.22946166992188,
"logps/rejected": -238.4442901611328,
"loss": 0.2742,
"rewards/accuracies": 0.90625,
"rewards/chosen": -1.3857475519180298,
"rewards/margins": 2.068246841430664,
"rewards/rejected": -3.4539945125579834,
"step": 146
},
{
"epoch": 1.8489166504001562,
"grad_norm": 7.7461193794138135,
"learning_rate": 1.9098300562505266e-08,
"logits/chosen": -0.6793495416641235,
"logits/rejected": -0.7641343474388123,
"logps/chosen": -187.63839721679688,
"logps/rejected": -191.110107421875,
"loss": 0.2835,
"rewards/accuracies": 0.890625,
"rewards/chosen": -1.504347801208496,
"rewards/margins": 1.7995954751968384,
"rewards/rejected": -3.303943395614624,
"step": 148
},
{
"epoch": 1.8739020105406987,
"grad_norm": 7.689771419387666,
"learning_rate": 1.3397459621556128e-08,
"logits/chosen": -0.6912616491317749,
"logits/rejected": -0.7680445313453674,
"logps/chosen": -182.9419708251953,
"logps/rejected": -200.2938690185547,
"loss": 0.2655,
"rewards/accuracies": 0.9296875,
"rewards/chosen": -1.4619263410568237,
"rewards/margins": 2.098031759262085,
"rewards/rejected": -3.559957981109619,
"step": 150
},
{
"epoch": 1.8739020105406987,
"eval_logits/chosen": -0.6168845891952515,
"eval_logits/rejected": -0.7024461627006531,
"eval_logps/chosen": -189.5503692626953,
"eval_logps/rejected": -188.0495147705078,
"eval_loss": 0.26570188999176025,
"eval_rewards/accuracies": 0.8399999737739563,
"eval_rewards/chosen": -1.3929405212402344,
"eval_rewards/margins": 2.0324180126190186,
"eval_rewards/rejected": -3.425358533859253,
"eval_runtime": 29.4565,
"eval_samples_per_second": 3.395,
"eval_steps_per_second": 0.849,
"step": 150
},
{
"epoch": 1.8988873706812415,
"grad_norm": 8.195993031555705,
"learning_rate": 8.645454235739902e-09,
"logits/chosen": -0.676539957523346,
"logits/rejected": -0.7394694685935974,
"logps/chosen": -181.1811065673828,
"logps/rejected": -207.39080810546875,
"loss": 0.2738,
"rewards/accuracies": 0.921875,
"rewards/chosen": -1.422343373298645,
"rewards/margins": 1.9836503267288208,
"rewards/rejected": -3.405993700027466,
"step": 152
},
{
"epoch": 1.9238727308217842,
"grad_norm": 7.242463445087286,
"learning_rate": 4.8943483704846465e-09,
"logits/chosen": -0.656994104385376,
"logits/rejected": -0.7103748321533203,
"logps/chosen": -183.3748321533203,
"logps/rejected": -188.81219482421875,
"loss": 0.2718,
"rewards/accuracies": 0.89453125,
"rewards/chosen": -1.4463797807693481,
"rewards/margins": 1.880941390991211,
"rewards/rejected": -3.3273210525512695,
"step": 154
},
{
"epoch": 1.9488580909623268,
"grad_norm": 6.9367447852277575,
"learning_rate": 2.1852399266194312e-09,
"logits/chosen": -0.6909129619598389,
"logits/rejected": -0.7760818004608154,
"logps/chosen": -177.80374145507812,
"logps/rejected": -266.32806396484375,
"loss": 0.2686,
"rewards/accuracies": 0.92578125,
"rewards/chosen": -1.4405275583267212,
"rewards/margins": 2.0439579486846924,
"rewards/rejected": -3.484485626220703,
"step": 156
},
{
"epoch": 1.9738434511028693,
"grad_norm": 7.621192478848495,
"learning_rate": 5.47810463172671e-10,
"logits/chosen": -0.6629250049591064,
"logits/rejected": -0.7373142242431641,
"logps/chosen": -183.51670837402344,
"logps/rejected": -190.675537109375,
"loss": 0.2845,
"rewards/accuracies": 0.921875,
"rewards/chosen": -1.451604962348938,
"rewards/margins": 2.0776655673980713,
"rewards/rejected": -3.5292704105377197,
"step": 158
},
{
"epoch": 1.998828811243412,
"grad_norm": 7.292176844709846,
"learning_rate": 0.0,
"logits/chosen": -0.745610773563385,
"logits/rejected": -0.8196827173233032,
"logps/chosen": -172.93553161621094,
"logps/rejected": -271.3219299316406,
"loss": 0.264,
"rewards/accuracies": 0.9140625,
"rewards/chosen": -1.2879290580749512,
"rewards/margins": 2.1956043243408203,
"rewards/rejected": -3.4835333824157715,
"step": 160
}
],
"logging_steps": 2,
"max_steps": 160,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
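
For reference, a minimal sketch (not part of the checkpoint itself) of how the `log_history` records above could be read back and summarized. It assumes the JSON is saved locally as `trainer_state.json`, a path not given in the file; the keys used (`loss`, `eval_loss`, `rewards/accuracies`, `rewards/margins`) are taken directly from the entries above, where training rows are logged every `logging_steps: 2` and evaluation rows every `eval_steps: 75`.

```python
# Sketch: load this trainer_state.json and print a per-step summary.
# The file name "trainer_state.json" is an assumed local path.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

print(f"global_step={state['global_step']}  epoch={state['epoch']:.3f}")

for entry in state["log_history"]:
    if "eval_loss" in entry:
        # Evaluation entries carry eval_* keys (logged every eval_steps=75).
        print(f"[eval ] step {entry['step']:>3}  "
              f"loss {entry['eval_loss']:.4f}  "
              f"acc {entry['eval_rewards/accuracies']:.3f}  "
              f"margin {entry['eval_rewards/margins']:.3f}")
    elif "loss" in entry:
        # Training entries (logged every logging_steps=2) carry the
        # preference-reward metrics for chosen vs. rejected completions.
        print(f"[train] step {entry['step']:>3}  "
              f"loss {entry['loss']:.4f}  "
              f"acc {entry['rewards/accuracies']:.3f}  "
              f"margin {entry['rewards/margins']:.3f}")
```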