zephyr-7b-dpo-full / trainer_state.json
RikkiXu's picture
Model save
8209ca4 verified
raw
history blame
No virus
27.8 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 27.37984871419997,
"learning_rate": 1.0416666666666666e-08,
"logits/chosen": -1.8783892393112183,
"logits/rejected": -1.8756425380706787,
"logps/chosen": -298.4870300292969,
"logps/rejected": -398.0157165527344,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 25.334426597070937,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -1.750243067741394,
"logits/rejected": -1.7067593336105347,
"logps/chosen": -280.5216369628906,
"logps/rejected": -271.8791809082031,
"loss": 0.6932,
"rewards/accuracies": 0.4236111044883728,
"rewards/chosen": -0.00042370916344225407,
"rewards/margins": -0.0002716032031457871,
"rewards/rejected": -0.00015210600395221263,
"step": 10
},
{
"epoch": 0.04,
"grad_norm": 23.205563002993117,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -1.8309519290924072,
"logits/rejected": -1.7239341735839844,
"logps/chosen": -298.9266662597656,
"logps/rejected": -320.81036376953125,
"loss": 0.6919,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.00880073755979538,
"rewards/margins": 0.0003546981024555862,
"rewards/rejected": 0.008446039631962776,
"step": 20
},
{
"epoch": 0.06,
"grad_norm": 22.833130746886702,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -1.8621642589569092,
"logits/rejected": -1.811255693435669,
"logps/chosen": -315.0081481933594,
"logps/rejected": -281.7824401855469,
"loss": 0.6846,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.06436704099178314,
"rewards/margins": 0.02108323760330677,
"rewards/rejected": 0.04328380152583122,
"step": 30
},
{
"epoch": 0.08,
"grad_norm": 20.296209907433,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -1.7256094217300415,
"logits/rejected": -1.6898906230926514,
"logps/chosen": -269.07220458984375,
"logps/rejected": -258.07366943359375,
"loss": 0.6708,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.15135471522808075,
"rewards/margins": 0.05834723263978958,
"rewards/rejected": 0.09300748258829117,
"step": 40
},
{
"epoch": 0.1,
"grad_norm": 18.992519669533575,
"learning_rate": 4.999733114418725e-07,
"logits/chosen": -1.7586348056793213,
"logits/rejected": -1.7471107244491577,
"logps/chosen": -274.77728271484375,
"logps/rejected": -298.24298095703125,
"loss": 0.6568,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.2238006889820099,
"rewards/margins": 0.05361497402191162,
"rewards/rejected": 0.17018567025661469,
"step": 50
},
{
"epoch": 0.13,
"grad_norm": 22.488749510223712,
"learning_rate": 4.990398100856366e-07,
"logits/chosen": -1.8446115255355835,
"logits/rejected": -1.8052647113800049,
"logps/chosen": -268.59100341796875,
"logps/rejected": -318.24041748046875,
"loss": 0.642,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.22674357891082764,
"rewards/margins": 0.11847379058599472,
"rewards/rejected": 0.10826978832483292,
"step": 60
},
{
"epoch": 0.15,
"grad_norm": 24.241452630651324,
"learning_rate": 4.967775735898179e-07,
"logits/chosen": -1.6720319986343384,
"logits/rejected": -1.6877762079238892,
"logps/chosen": -274.5986022949219,
"logps/rejected": -289.9263610839844,
"loss": 0.6123,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": 0.11401952803134918,
"rewards/margins": 0.22531266510486603,
"rewards/rejected": -0.11129315197467804,
"step": 70
},
{
"epoch": 0.17,
"grad_norm": 32.48718302712838,
"learning_rate": 4.931986719649298e-07,
"logits/chosen": -1.910599946975708,
"logits/rejected": -1.7989906072616577,
"logps/chosen": -356.32135009765625,
"logps/rejected": -325.3817443847656,
"loss": 0.5878,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07416001707315445,
"rewards/margins": 0.30830469727516174,
"rewards/rejected": -0.3824646770954132,
"step": 80
},
{
"epoch": 0.19,
"grad_norm": 32.46521048247274,
"learning_rate": 4.883222001996351e-07,
"logits/chosen": -1.781141996383667,
"logits/rejected": -1.773406982421875,
"logps/chosen": -326.0487365722656,
"logps/rejected": -370.7205505371094,
"loss": 0.5637,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.45380252599716187,
"rewards/margins": 0.5182568430900574,
"rewards/rejected": -0.9720592498779297,
"step": 90
},
{
"epoch": 0.21,
"grad_norm": 33.51530497027872,
"learning_rate": 4.821741763807186e-07,
"logits/chosen": -1.795566201210022,
"logits/rejected": -1.7746385335922241,
"logps/chosen": -341.0810241699219,
"logps/rejected": -391.9131774902344,
"loss": 0.5671,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6200565695762634,
"rewards/margins": 0.5509090423583984,
"rewards/rejected": -1.1709656715393066,
"step": 100
},
{
"epoch": 0.21,
"eval_logits/chosen": -1.8679490089416504,
"eval_logits/rejected": -1.8570616245269775,
"eval_logps/chosen": -316.96636962890625,
"eval_logps/rejected": -376.7557373046875,
"eval_loss": 0.5698967576026917,
"eval_rewards/accuracies": 0.73046875,
"eval_rewards/chosen": -0.3533283472061157,
"eval_rewards/margins": 0.5366135239601135,
"eval_rewards/rejected": -0.8899418115615845,
"eval_runtime": 97.6563,
"eval_samples_per_second": 20.48,
"eval_steps_per_second": 0.328,
"step": 100
},
{
"epoch": 0.23,
"grad_norm": 34.820943984944364,
"learning_rate": 4.747874028753375e-07,
"logits/chosen": -1.9302442073822021,
"logits/rejected": -1.8041632175445557,
"logps/chosen": -364.3658142089844,
"logps/rejected": -368.28619384765625,
"loss": 0.5779,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.3326733410358429,
"rewards/margins": 0.5019634962081909,
"rewards/rejected": -0.8346366882324219,
"step": 110
},
{
"epoch": 0.25,
"grad_norm": 35.52031238722188,
"learning_rate": 4.662012913161997e-07,
"logits/chosen": -1.8828121423721313,
"logits/rejected": -1.8731359243392944,
"logps/chosen": -346.777099609375,
"logps/rejected": -378.0817565917969,
"loss": 0.544,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.42460617423057556,
"rewards/margins": 0.5200009942054749,
"rewards/rejected": -0.9446069598197937,
"step": 120
},
{
"epoch": 0.27,
"grad_norm": 40.83171596073763,
"learning_rate": 4.5646165232345103e-07,
"logits/chosen": -1.9067420959472656,
"logits/rejected": -1.848259687423706,
"logps/chosen": -353.1668395996094,
"logps/rejected": -412.601806640625,
"loss": 0.5319,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.5689653158187866,
"rewards/margins": 0.6179059147834778,
"rewards/rejected": -1.1868712902069092,
"step": 130
},
{
"epoch": 0.29,
"grad_norm": 39.57816446283388,
"learning_rate": 4.456204510851956e-07,
"logits/chosen": -1.79110848903656,
"logits/rejected": -1.710828423500061,
"logps/chosen": -390.3045959472656,
"logps/rejected": -453.116943359375,
"loss": 0.537,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.5443016290664673,
"rewards/margins": 0.7209955453872681,
"rewards/rejected": -1.2652971744537354,
"step": 140
},
{
"epoch": 0.31,
"grad_norm": 45.241736858623206,
"learning_rate": 4.337355301007335e-07,
"logits/chosen": -1.8114426136016846,
"logits/rejected": -1.7426559925079346,
"logps/chosen": -352.48992919921875,
"logps/rejected": -402.91943359375,
"loss": 0.5462,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.5064585208892822,
"rewards/margins": 0.5219663381576538,
"rewards/rejected": -1.028424859046936,
"step": 150
},
{
"epoch": 0.33,
"grad_norm": 73.25214998863763,
"learning_rate": 4.2087030056579986e-07,
"logits/chosen": -1.8390640020370483,
"logits/rejected": -1.7504537105560303,
"logps/chosen": -339.1869812011719,
"logps/rejected": -387.9916076660156,
"loss": 0.5442,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.4455109238624573,
"rewards/margins": 0.7391675710678101,
"rewards/rejected": -1.1846784353256226,
"step": 160
},
{
"epoch": 0.36,
"grad_norm": 48.08778532882697,
"learning_rate": 4.070934040463998e-07,
"logits/chosen": -1.7452236413955688,
"logits/rejected": -1.6487846374511719,
"logps/chosen": -335.72528076171875,
"logps/rejected": -377.5245361328125,
"loss": 0.5304,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6902536749839783,
"rewards/margins": 0.5506319999694824,
"rewards/rejected": -1.2408854961395264,
"step": 170
},
{
"epoch": 0.38,
"grad_norm": 45.566526901622865,
"learning_rate": 3.9247834624635404e-07,
"logits/chosen": -1.5920779705047607,
"logits/rejected": -1.5328117609024048,
"logps/chosen": -352.29937744140625,
"logps/rejected": -390.72100830078125,
"loss": 0.5011,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.8087765574455261,
"rewards/margins": 0.6529080867767334,
"rewards/rejected": -1.4616845846176147,
"step": 180
},
{
"epoch": 0.4,
"grad_norm": 48.30624199959232,
"learning_rate": 3.7710310482256523e-07,
"logits/chosen": -1.7276074886322021,
"logits/rejected": -1.6613149642944336,
"logps/chosen": -347.87579345703125,
"logps/rejected": -405.24237060546875,
"loss": 0.5308,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5396801829338074,
"rewards/margins": 0.6805658936500549,
"rewards/rejected": -1.2202460765838623,
"step": 190
},
{
"epoch": 0.42,
"grad_norm": 99.6040419345467,
"learning_rate": 3.610497133404795e-07,
"logits/chosen": -1.7740137577056885,
"logits/rejected": -1.7177015542984009,
"logps/chosen": -344.6033020019531,
"logps/rejected": -404.29229736328125,
"loss": 0.5413,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6725525856018066,
"rewards/margins": 0.6624492406845093,
"rewards/rejected": -1.3350017070770264,
"step": 200
},
{
"epoch": 0.42,
"eval_logits/chosen": -1.901658296585083,
"eval_logits/rejected": -1.8679291009902954,
"eval_logps/chosen": -349.037841796875,
"eval_logps/rejected": -432.6194152832031,
"eval_loss": 0.5253521800041199,
"eval_rewards/accuracies": 0.7265625,
"eval_rewards/chosen": -0.6740425825119019,
"eval_rewards/margins": 0.7745361328125,
"eval_rewards/rejected": -1.4485788345336914,
"eval_runtime": 97.5006,
"eval_samples_per_second": 20.513,
"eval_steps_per_second": 0.328,
"step": 200
},
{
"epoch": 0.44,
"grad_norm": 46.68866608504909,
"learning_rate": 3.4440382358952115e-07,
"logits/chosen": -1.734480619430542,
"logits/rejected": -1.6646308898925781,
"logps/chosen": -384.4491882324219,
"logps/rejected": -421.3724670410156,
"loss": 0.5373,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7106617093086243,
"rewards/margins": 0.7391539812088013,
"rewards/rejected": -1.4498156309127808,
"step": 210
},
{
"epoch": 0.46,
"grad_norm": 44.67370083595421,
"learning_rate": 3.272542485937368e-07,
"logits/chosen": -1.7060235738754272,
"logits/rejected": -1.621319055557251,
"logps/chosen": -333.583740234375,
"logps/rejected": -387.3582458496094,
"loss": 0.5233,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.5107825994491577,
"rewards/margins": 0.7757614850997925,
"rewards/rejected": -1.2865440845489502,
"step": 220
},
{
"epoch": 0.48,
"grad_norm": 44.602377758622936,
"learning_rate": 3.096924887558854e-07,
"logits/chosen": -1.672357201576233,
"logits/rejected": -1.677425742149353,
"logps/chosen": -334.2008361816406,
"logps/rejected": -428.0926208496094,
"loss": 0.5239,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.6795379519462585,
"rewards/margins": 0.8884698152542114,
"rewards/rejected": -1.5680078268051147,
"step": 230
},
{
"epoch": 0.5,
"grad_norm": 43.82303533573589,
"learning_rate": 2.9181224366319943e-07,
"logits/chosen": -1.6859185695648193,
"logits/rejected": -1.6255781650543213,
"logps/chosen": -357.11773681640625,
"logps/rejected": -421.4244079589844,
"loss": 0.4902,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6822614073753357,
"rewards/margins": 0.8219982385635376,
"rewards/rejected": -1.5042595863342285,
"step": 240
},
{
"epoch": 0.52,
"grad_norm": 46.68066851465082,
"learning_rate": 2.7370891215954565e-07,
"logits/chosen": -1.5676295757293701,
"logits/rejected": -1.4538037776947021,
"logps/chosen": -401.59979248046875,
"logps/rejected": -471.2294006347656,
"loss": 0.5154,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.7957779765129089,
"rewards/margins": 1.0585238933563232,
"rewards/rejected": -1.8543018102645874,
"step": 250
},
{
"epoch": 0.54,
"grad_norm": 45.74080164598797,
"learning_rate": 2.55479083351317e-07,
"logits/chosen": -1.5453943014144897,
"logits/rejected": -1.3946092128753662,
"logps/chosen": -411.67681884765625,
"logps/rejected": -464.185791015625,
"loss": 0.5124,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.9510448575019836,
"rewards/margins": 0.9077135324478149,
"rewards/rejected": -1.8587583303451538,
"step": 260
},
{
"epoch": 0.56,
"grad_norm": 35.67215071482242,
"learning_rate": 2.3722002126275822e-07,
"logits/chosen": -1.4410674571990967,
"logits/rejected": -1.4173917770385742,
"logps/chosen": -389.84442138671875,
"logps/rejected": -446.63946533203125,
"loss": 0.5176,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.882165253162384,
"rewards/margins": 0.7578494548797607,
"rewards/rejected": -1.6400146484375,
"step": 270
},
{
"epoch": 0.59,
"grad_norm": 45.13283149696396,
"learning_rate": 2.19029145890313e-07,
"logits/chosen": -1.3330192565917969,
"logits/rejected": -1.2097164392471313,
"logps/chosen": -361.392578125,
"logps/rejected": -428.5855407714844,
"loss": 0.5182,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7854728102684021,
"rewards/margins": 0.8376423716545105,
"rewards/rejected": -1.6231151819229126,
"step": 280
},
{
"epoch": 0.61,
"grad_norm": 42.58221661061121,
"learning_rate": 2.0100351342479216e-07,
"logits/chosen": -1.3783751726150513,
"logits/rejected": -1.3098156452178955,
"logps/chosen": -341.5384521484375,
"logps/rejected": -416.83050537109375,
"loss": 0.5035,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.7412964105606079,
"rewards/margins": 0.8173438906669617,
"rewards/rejected": -1.5586402416229248,
"step": 290
},
{
"epoch": 0.63,
"grad_norm": 44.44135138330837,
"learning_rate": 1.8323929841460178e-07,
"logits/chosen": -1.4361331462860107,
"logits/rejected": -1.2948487997055054,
"logps/chosen": -415.39324951171875,
"logps/rejected": -463.1932067871094,
"loss": 0.4829,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9276901483535767,
"rewards/margins": 0.8659119606018066,
"rewards/rejected": -1.7936019897460938,
"step": 300
},
{
"epoch": 0.63,
"eval_logits/chosen": -1.256626844406128,
"eval_logits/rejected": -1.199381709098816,
"eval_logps/chosen": -368.2399597167969,
"eval_logps/rejected": -477.2876892089844,
"eval_loss": 0.49556368589401245,
"eval_rewards/accuracies": 0.78125,
"eval_rewards/chosen": -0.8660640716552734,
"eval_rewards/margins": 1.0291972160339355,
"eval_rewards/rejected": -1.895261287689209,
"eval_runtime": 97.5907,
"eval_samples_per_second": 20.494,
"eval_steps_per_second": 0.328,
"step": 300
},
{
"epoch": 0.65,
"grad_norm": 49.20598478293576,
"learning_rate": 1.6583128063291573e-07,
"logits/chosen": -1.070894479751587,
"logits/rejected": -0.999220073223114,
"logps/chosen": -422.22509765625,
"logps/rejected": -478.1600646972656,
"loss": 0.4882,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.0501785278320312,
"rewards/margins": 0.9182316660881042,
"rewards/rejected": -1.9684101343154907,
"step": 310
},
{
"epoch": 0.67,
"grad_norm": 49.52786644109137,
"learning_rate": 1.488723393865766e-07,
"logits/chosen": -0.9732829332351685,
"logits/rejected": -0.8598931431770325,
"logps/chosen": -423.58465576171875,
"logps/rejected": -463.65087890625,
"loss": 0.482,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0610870122909546,
"rewards/margins": 0.8992059826850891,
"rewards/rejected": -1.960293173789978,
"step": 320
},
{
"epoch": 0.69,
"grad_norm": 44.16474950280745,
"learning_rate": 1.3245295796480788e-07,
"logits/chosen": -1.1072012186050415,
"logits/rejected": -0.9854669570922852,
"logps/chosen": -383.697509765625,
"logps/rejected": -467.64630126953125,
"loss": 0.4809,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.015866756439209,
"rewards/margins": 0.7990261316299438,
"rewards/rejected": -1.8148927688598633,
"step": 330
},
{
"epoch": 0.71,
"grad_norm": 49.790170416193874,
"learning_rate": 1.1666074087171627e-07,
"logits/chosen": -1.0710186958312988,
"logits/rejected": -0.9443724751472473,
"logps/chosen": -406.4459228515625,
"logps/rejected": -490.1005859375,
"loss": 0.4931,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.9062315821647644,
"rewards/margins": 0.9880655407905579,
"rewards/rejected": -1.8942972421646118,
"step": 340
},
{
"epoch": 0.73,
"grad_norm": 45.78788884909769,
"learning_rate": 1.0157994641835734e-07,
"logits/chosen": -0.964527428150177,
"logits/rejected": -0.8877021670341492,
"logps/chosen": -366.8417053222656,
"logps/rejected": -448.39239501953125,
"loss": 0.4708,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.8918215036392212,
"rewards/margins": 0.9880329966545105,
"rewards/rejected": -1.879854440689087,
"step": 350
},
{
"epoch": 0.75,
"grad_norm": 47.523486254775236,
"learning_rate": 8.729103716819111e-08,
"logits/chosen": -1.099103331565857,
"logits/rejected": -0.9152529835700989,
"logps/chosen": -418.78790283203125,
"logps/rejected": -472.5894470214844,
"loss": 0.5139,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9209517240524292,
"rewards/margins": 0.9369718432426453,
"rewards/rejected": -1.8579237461090088,
"step": 360
},
{
"epoch": 0.77,
"grad_norm": 40.46200259764798,
"learning_rate": 7.387025063449081e-08,
"logits/chosen": -1.0258140563964844,
"logits/rejected": -0.9037224054336548,
"logps/chosen": -389.7789611816406,
"logps/rejected": -427.80902099609375,
"loss": 0.501,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9432722926139832,
"rewards/margins": 0.7514128684997559,
"rewards/rejected": -1.6946852207183838,
"step": 370
},
{
"epoch": 0.79,
"grad_norm": 44.99044596346264,
"learning_rate": 6.138919252022435e-08,
"logits/chosen": -0.9202815294265747,
"logits/rejected": -0.9092128872871399,
"logps/chosen": -369.4691467285156,
"logps/rejected": -499.80047607421875,
"loss": 0.4856,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1355737447738647,
"rewards/margins": 1.0553382635116577,
"rewards/rejected": -2.1909122467041016,
"step": 380
},
{
"epoch": 0.82,
"grad_norm": 46.73184407203235,
"learning_rate": 4.991445467064689e-08,
"logits/chosen": -1.017165184020996,
"logits/rejected": -0.9522297978401184,
"logps/chosen": -429.59124755859375,
"logps/rejected": -499.8984375,
"loss": 0.4844,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0497267246246338,
"rewards/margins": 0.8575556874275208,
"rewards/rejected": -1.9072824716567993,
"step": 390
},
{
"epoch": 0.84,
"grad_norm": 45.88759783660656,
"learning_rate": 3.9507259776993954e-08,
"logits/chosen": -0.8822342753410339,
"logits/rejected": -0.7616764307022095,
"logps/chosen": -393.79986572265625,
"logps/rejected": -488.7137145996094,
"loss": 0.4981,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0166757106781006,
"rewards/margins": 0.9999138116836548,
"rewards/rejected": -2.016589403152466,
"step": 400
},
{
"epoch": 0.84,
"eval_logits/chosen": -1.0730373859405518,
"eval_logits/rejected": -0.9850106239318848,
"eval_logps/chosen": -365.3529357910156,
"eval_logps/rejected": -476.14508056640625,
"eval_loss": 0.49130114912986755,
"eval_rewards/accuracies": 0.78515625,
"eval_rewards/chosen": -0.8371938467025757,
"eval_rewards/margins": 1.0466417074203491,
"eval_rewards/rejected": -1.8838355541229248,
"eval_runtime": 97.6225,
"eval_samples_per_second": 20.487,
"eval_steps_per_second": 0.328,
"step": 400
},
{
"epoch": 0.86,
"grad_norm": 44.331882429947925,
"learning_rate": 3.022313472693447e-08,
"logits/chosen": -1.0954724550247192,
"logits/rejected": -0.854290783405304,
"logps/chosen": -407.5718078613281,
"logps/rejected": -482.7383728027344,
"loss": 0.4921,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8705843687057495,
"rewards/margins": 1.0231659412384033,
"rewards/rejected": -1.8937501907348633,
"step": 410
},
{
"epoch": 0.88,
"grad_norm": 48.31749590006741,
"learning_rate": 2.2111614344599684e-08,
"logits/chosen": -1.0192204713821411,
"logits/rejected": -0.973158061504364,
"logps/chosen": -416.341552734375,
"logps/rejected": -486.69232177734375,
"loss": 0.4856,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.9502483606338501,
"rewards/margins": 0.9265721440315247,
"rewards/rejected": -1.8768205642700195,
"step": 420
},
{
"epoch": 0.9,
"grad_norm": 40.281913550333705,
"learning_rate": 1.521597710086439e-08,
"logits/chosen": -0.9044865369796753,
"logits/rejected": -0.8032494783401489,
"logps/chosen": -405.5864562988281,
"logps/rejected": -480.13201904296875,
"loss": 0.4776,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.037095546722412,
"rewards/margins": 0.9771502614021301,
"rewards/rejected": -2.0142457485198975,
"step": 430
},
{
"epoch": 0.92,
"grad_norm": 43.058313272164526,
"learning_rate": 9.57301420397924e-09,
"logits/chosen": -0.9727311134338379,
"logits/rejected": -0.8283950090408325,
"logps/chosen": -393.12823486328125,
"logps/rejected": -472.7400817871094,
"loss": 0.4944,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8835296630859375,
"rewards/margins": 0.8925860524177551,
"rewards/rejected": -1.7761156558990479,
"step": 440
},
{
"epoch": 0.94,
"grad_norm": 45.17569103872668,
"learning_rate": 5.212833302556258e-09,
"logits/chosen": -0.8834640383720398,
"logits/rejected": -0.8035561442375183,
"logps/chosen": -416.1705627441406,
"logps/rejected": -517.479248046875,
"loss": 0.4973,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.0344994068145752,
"rewards/margins": 0.8305438756942749,
"rewards/rejected": -1.8650434017181396,
"step": 450
},
{
"epoch": 0.96,
"grad_norm": 57.18112420515564,
"learning_rate": 2.158697848236607e-09,
"logits/chosen": -0.9279729723930359,
"logits/rejected": -0.8204873204231262,
"logps/chosen": -390.6974182128906,
"logps/rejected": -443.84051513671875,
"loss": 0.4944,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9189409017562866,
"rewards/margins": 0.8423686027526855,
"rewards/rejected": -1.7613098621368408,
"step": 460
},
{
"epoch": 0.98,
"grad_norm": 40.005457130126345,
"learning_rate": 4.269029751107489e-10,
"logits/chosen": -0.9693315625190735,
"logits/rejected": -0.8152003288269043,
"logps/chosen": -384.0590515136719,
"logps/rejected": -482.89630126953125,
"loss": 0.4792,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8925178647041321,
"rewards/margins": 0.9355740547180176,
"rewards/rejected": -1.8280918598175049,
"step": 470
},
{
"epoch": 1.0,
"step": 478,
"total_flos": 0.0,
"train_loss": 0.5347933170685708,
"train_runtime": 7634.2165,
"train_samples_per_second": 8.008,
"train_steps_per_second": 0.063
}
],
"logging_steps": 10,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}