{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 49.891043665102934, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.7660439014434814, "logits/rejected": -2.717564582824707, "logps/chosen": -269.8568420410156, "logps/rejected": -360.52459716796875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 46.946091297352105, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.592543125152588, "logits/rejected": -2.56319522857666, "logps/chosen": -264.7040100097656, "logps/rejected": -251.515625, "loss": 0.6933, "rewards/accuracies": 0.4791666567325592, "rewards/chosen": 0.004693002440035343, "rewards/margins": 0.0028277651872485876, "rewards/rejected": 0.0018652371363714337, "step": 10 }, { "epoch": 0.04, "grad_norm": 41.817724108185395, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.65449595451355, "logits/rejected": -2.6068952083587646, "logps/chosen": -280.5221252441406, "logps/rejected": -295.92376708984375, "loss": 0.689, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.05156273767352104, "rewards/margins": 0.00740828737616539, "rewards/rejected": 0.04415445029735565, "step": 20 }, { "epoch": 0.06, "grad_norm": 39.81553425430633, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.6671488285064697, "logits/rejected": -2.5955922603607178, "logps/chosen": -296.41644287109375, "logps/rejected": -260.6401672363281, "loss": 0.6733, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2127685844898224, "rewards/margins": 0.04726782441139221, "rewards/rejected": 0.16550076007843018, "step": 30 }, { "epoch": 0.08, "grad_norm": 38.58454153774096, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.5658886432647705, "logits/rejected": -2.5324325561523438, "logps/chosen": -259.78594970703125, "logps/rejected": -241.00991821289062, "loss": 0.6399, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3669721484184265, "rewards/margins": 0.19786901772022247, "rewards/rejected": 0.16910310089588165, "step": 40 }, { "epoch": 0.1, "grad_norm": 37.351752662935816, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.5195257663726807, "logits/rejected": -2.4827651977539062, "logps/chosen": -273.65081787109375, "logps/rejected": -290.78680419921875, "loss": 0.6094, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.304054319858551, "rewards/margins": 0.2041884958744049, "rewards/rejected": 0.09986577928066254, "step": 50 }, { "epoch": 0.13, "grad_norm": 39.61129660699584, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.567991018295288, "logits/rejected": -2.5036864280700684, "logps/chosen": -260.38055419921875, "logps/rejected": -294.011474609375, "loss": 0.6013, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5612996220588684, "rewards/margins": 0.3578048348426819, "rewards/rejected": 0.20349478721618652, "step": 60 }, { "epoch": 0.15, "grad_norm": 41.460696281749556, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.460195302963257, "logits/rejected": -2.46120023727417, "logps/chosen": -253.1399383544922, "logps/rejected": -253.4242706298828, "loss": 0.5693, "rewards/accuracies": 0.78125, "rewards/chosen": 0.6238263845443726, "rewards/margins": 0.4591788649559021, "rewards/rejected": 0.16464750468730927, "step": 70 }, { "epoch": 0.17, "grad_norm": 61.37030849441711, "learning_rate": 4.931986719649298e-07, "logits/chosen": -2.615948438644409, "logits/rejected": -2.5394978523254395, "logps/chosen": -311.7240295410156, "logps/rejected": -263.1805725097656, "loss": 0.5671, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5546952486038208, "rewards/margins": 0.5107932686805725, "rewards/rejected": 0.04390193149447441, "step": 80 }, { "epoch": 0.19, "grad_norm": 39.59727717104598, "learning_rate": 4.883222001996351e-07, "logits/chosen": -2.5085294246673584, "logits/rejected": -2.4543616771698, "logps/chosen": -251.203369140625, "logps/rejected": -259.8647766113281, "loss": 0.5646, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.3953971564769745, "rewards/margins": 0.7687323689460754, "rewards/rejected": -0.37333518266677856, "step": 90 }, { "epoch": 0.21, "grad_norm": 36.57841721590594, "learning_rate": 4.821741763807186e-07, "logits/chosen": -2.499514102935791, "logits/rejected": -2.4649369716644287, "logps/chosen": -248.44363403320312, "logps/rejected": -257.64776611328125, "loss": 0.565, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.5957446694374084, "rewards/margins": 0.6267115473747253, "rewards/rejected": -0.03096688725054264, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -2.550398111343384, "eval_logits/rejected": -2.5104503631591797, "eval_logps/chosen": -250.69297790527344, "eval_logps/rejected": -262.7791748046875, "eval_loss": 0.5717624425888062, "eval_rewards/accuracies": 0.73828125, "eval_rewards/chosen": 0.5950239300727844, "eval_rewards/margins": 0.6006231904029846, "eval_rewards/rejected": -0.005599223077297211, "eval_runtime": 96.9486, "eval_samples_per_second": 20.629, "eval_steps_per_second": 0.33, "step": 100 }, { "epoch": 0.23, "grad_norm": 51.91494841998397, "learning_rate": 4.747874028753375e-07, "logits/chosen": -2.55851149559021, "logits/rejected": -2.4656014442443848, "logps/chosen": -292.62615966796875, "logps/rejected": -258.59661865234375, "loss": 0.5713, "rewards/accuracies": 0.75, "rewards/chosen": 0.5976964831352234, "rewards/margins": 0.6357330083847046, "rewards/rejected": -0.0380365327000618, "step": 110 }, { "epoch": 0.25, "grad_norm": 70.69069363258822, "learning_rate": 4.662012913161997e-07, "logits/chosen": -2.4600424766540527, "logits/rejected": -2.4324684143066406, "logps/chosen": -270.7308349609375, "logps/rejected": -260.5433349609375, "loss": 0.5497, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5218156576156616, "rewards/margins": 0.5561539530754089, "rewards/rejected": -0.03433822840452194, "step": 120 }, { "epoch": 0.27, "grad_norm": 42.312370253489476, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -2.464791774749756, "logits/rejected": -2.439894676208496, "logps/chosen": -268.9382019042969, "logps/rejected": -269.9627685546875, "loss": 0.5423, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6532469987869263, "rewards/margins": 0.7295945882797241, "rewards/rejected": -0.07634757459163666, "step": 130 }, { "epoch": 0.29, "grad_norm": 40.45859260542855, "learning_rate": 4.456204510851956e-07, "logits/chosen": -2.5265681743621826, "logits/rejected": -2.485774517059326, "logps/chosen": -303.1440124511719, "logps/rejected": -301.68914794921875, "loss": 0.5376, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.6410696506500244, "rewards/margins": 0.6916864514350891, "rewards/rejected": -0.0506168007850647, "step": 140 }, { "epoch": 0.31, "grad_norm": 41.1747855806655, "learning_rate": 4.337355301007335e-07, "logits/chosen": -2.5189616680145264, "logits/rejected": -2.4531705379486084, "logps/chosen": -272.0736999511719, "logps/rejected": -276.2969055175781, "loss": 0.5442, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5575242042541504, "rewards/margins": 0.5619192719459534, "rewards/rejected": -0.004395070485770702, "step": 150 }, { "epoch": 0.33, "grad_norm": 48.726180323544725, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -2.5174994468688965, "logits/rejected": -2.43558406829834, "logps/chosen": -260.0892028808594, "logps/rejected": -260.7149658203125, "loss": 0.5652, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.6489425897598267, "rewards/margins": 0.8134964108467102, "rewards/rejected": -0.16455380618572235, "step": 160 }, { "epoch": 0.36, "grad_norm": 49.53825953706789, "learning_rate": 4.070934040463998e-07, "logits/chosen": -2.4509148597717285, "logits/rejected": -2.3897039890289307, "logps/chosen": -239.52261352539062, "logps/rejected": -233.6277618408203, "loss": 0.5489, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.49063143134117126, "rewards/margins": 0.6157802939414978, "rewards/rejected": -0.12514881789684296, "step": 170 }, { "epoch": 0.38, "grad_norm": 51.08561061111303, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -2.3483898639678955, "logits/rejected": -2.306784152984619, "logps/chosen": -247.6396026611328, "logps/rejected": -231.8523406982422, "loss": 0.5181, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.43596941232681274, "rewards/margins": 0.6500319242477417, "rewards/rejected": -0.21406252682209015, "step": 180 }, { "epoch": 0.4, "grad_norm": 42.31027201995276, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -2.41634464263916, "logits/rejected": -2.378105878829956, "logps/chosen": -260.20306396484375, "logps/rejected": -261.46502685546875, "loss": 0.5392, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.5519876480102539, "rewards/margins": 0.6375012993812561, "rewards/rejected": -0.08551368862390518, "step": 190 }, { "epoch": 0.42, "grad_norm": 102.86207924802177, "learning_rate": 3.610497133404795e-07, "logits/chosen": -2.392763614654541, "logits/rejected": -2.381993532180786, "logps/chosen": -249.912109375, "logps/rejected": -256.75439453125, "loss": 0.5467, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.49922746419906616, "rewards/margins": 0.7344967126846313, "rewards/rejected": -0.235269233584404, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -2.517864942550659, "eval_logits/rejected": -2.4783387184143066, "eval_logps/chosen": -249.6370849609375, "eval_logps/rejected": -264.89788818359375, "eval_loss": 0.5432960391044617, "eval_rewards/accuracies": 0.74609375, "eval_rewards/chosen": 0.6478186845779419, "eval_rewards/margins": 0.759353518486023, "eval_rewards/rejected": -0.11153475195169449, "eval_runtime": 96.4207, "eval_samples_per_second": 20.742, "eval_steps_per_second": 0.332, "step": 200 }, { "epoch": 0.44, "grad_norm": 45.308290366409736, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -2.4460113048553467, "logits/rejected": -2.391810894012451, "logps/chosen": -278.56781005859375, "logps/rejected": -257.2254943847656, "loss": 0.5436, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.5562152862548828, "rewards/margins": 0.8551079034805298, "rewards/rejected": -0.29889267683029175, "step": 210 }, { "epoch": 0.46, "grad_norm": 50.1182470431882, "learning_rate": 3.272542485937368e-07, "logits/chosen": -2.4605488777160645, "logits/rejected": -2.42708683013916, "logps/chosen": -257.90826416015625, "logps/rejected": -253.3182830810547, "loss": 0.54, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.3734419643878937, "rewards/margins": 0.7561658024787903, "rewards/rejected": -0.382723867893219, "step": 220 }, { "epoch": 0.48, "grad_norm": 43.71024962971359, "learning_rate": 3.096924887558854e-07, "logits/chosen": -2.490509510040283, "logits/rejected": -2.4491913318634033, "logps/chosen": -237.17898559570312, "logps/rejected": -251.81686401367188, "loss": 0.5441, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.5843235850334167, "rewards/margins": 0.7882751226425171, "rewards/rejected": -0.20395155251026154, "step": 230 }, { "epoch": 0.5, "grad_norm": 44.93616969967234, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -2.533695697784424, "logits/rejected": -2.500807285308838, "logps/chosen": -253.635498046875, "logps/rejected": -253.0944061279297, "loss": 0.5142, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5032340884208679, "rewards/margins": 0.7045356035232544, "rewards/rejected": -0.2013014256954193, "step": 240 }, { "epoch": 0.52, "grad_norm": 42.68904256130122, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -2.483025074005127, "logits/rejected": -2.4015185832977295, "logps/chosen": -285.0963439941406, "logps/rejected": -263.43560791015625, "loss": 0.5198, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.6547069549560547, "rewards/margins": 0.850358784198761, "rewards/rejected": -0.19565197825431824, "step": 250 }, { "epoch": 0.54, "grad_norm": 45.43502171857602, "learning_rate": 2.55479083351317e-07, "logits/chosen": -2.516913890838623, "logits/rejected": -2.478473424911499, "logps/chosen": -282.80230712890625, "logps/rejected": -258.77288818359375, "loss": 0.5235, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.5754625797271729, "rewards/margins": 0.8150871396064758, "rewards/rejected": -0.23962458968162537, "step": 260 }, { "epoch": 0.56, "grad_norm": 41.73526734917468, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -2.5381340980529785, "logits/rejected": -2.4941086769104004, "logps/chosen": -267.4333190917969, "logps/rejected": -260.50677490234375, "loss": 0.5406, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4781308174133301, "rewards/margins": 0.6212563514709473, "rewards/rejected": -0.14312560856342316, "step": 270 }, { "epoch": 0.59, "grad_norm": 48.561323508433155, "learning_rate": 2.19029145890313e-07, "logits/chosen": -2.510133743286133, "logits/rejected": -2.4422435760498047, "logps/chosen": -250.73855590820312, "logps/rejected": -247.487060546875, "loss": 0.5599, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.5469261407852173, "rewards/margins": 0.8119627833366394, "rewards/rejected": -0.2650366425514221, "step": 280 }, { "epoch": 0.61, "grad_norm": 44.504093632124075, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -2.5589568614959717, "logits/rejected": -2.5217483043670654, "logps/chosen": -240.7520751953125, "logps/rejected": -244.8422088623047, "loss": 0.5354, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.5167636871337891, "rewards/margins": 0.661081075668335, "rewards/rejected": -0.14431743323802948, "step": 290 }, { "epoch": 0.63, "grad_norm": 52.52022669231452, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -2.5429511070251465, "logits/rejected": -2.472679376602173, "logps/chosen": -292.2240905761719, "logps/rejected": -266.68658447265625, "loss": 0.517, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.4385985732078552, "rewards/margins": 0.7676541209220886, "rewards/rejected": -0.329055517911911, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": -2.5622596740722656, "eval_logits/rejected": -2.520256280899048, "eval_logps/chosen": -251.2219696044922, "eval_logps/rejected": -268.04449462890625, "eval_loss": 0.53697669506073, "eval_rewards/accuracies": 0.76953125, "eval_rewards/chosen": 0.5685745477676392, "eval_rewards/margins": 0.8374388217926025, "eval_rewards/rejected": -0.2688642740249634, "eval_runtime": 96.3678, "eval_samples_per_second": 20.754, "eval_steps_per_second": 0.332, "step": 300 }, { "epoch": 0.65, "grad_norm": 43.866661437938184, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -2.4593474864959717, "logits/rejected": -2.443233013153076, "logps/chosen": -285.5498046875, "logps/rejected": -263.8379821777344, "loss": 0.5077, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.4893050193786621, "rewards/margins": 0.7553777098655701, "rewards/rejected": -0.26607269048690796, "step": 310 }, { "epoch": 0.67, "grad_norm": 43.407860217947494, "learning_rate": 1.488723393865766e-07, "logits/chosen": -2.4746253490448, "logits/rejected": -2.4388270378112793, "logps/chosen": -283.4583740234375, "logps/rejected": -250.38204956054688, "loss": 0.504, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.5105848908424377, "rewards/margins": 0.788524329662323, "rewards/rejected": -0.2779393792152405, "step": 320 }, { "epoch": 0.69, "grad_norm": 40.302692173545196, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -2.4712371826171875, "logits/rejected": -2.4099698066711426, "logps/chosen": -252.349853515625, "logps/rejected": -264.03912353515625, "loss": 0.5242, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4930170178413391, "rewards/margins": 0.7200408577919006, "rewards/rejected": -0.2270239144563675, "step": 330 }, { "epoch": 0.71, "grad_norm": 50.168955016672676, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -2.467729091644287, "logits/rejected": -2.4046943187713623, "logps/chosen": -278.697509765625, "logps/rejected": -285.4507141113281, "loss": 0.524, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.5648467540740967, "rewards/margins": 0.8352931141853333, "rewards/rejected": -0.2704463601112366, "step": 340 }, { "epoch": 0.73, "grad_norm": 46.15971070553052, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -2.445666790008545, "logits/rejected": -2.377004384994507, "logps/chosen": -248.63241577148438, "logps/rejected": -248.23904418945312, "loss": 0.4924, "rewards/accuracies": 0.75, "rewards/chosen": 0.47894006967544556, "rewards/margins": 0.8554509878158569, "rewards/rejected": -0.37651100754737854, "step": 350 }, { "epoch": 0.75, "grad_norm": 54.17198760484943, "learning_rate": 8.729103716819111e-08, "logits/chosen": -2.4745380878448486, "logits/rejected": -2.376185178756714, "logps/chosen": -292.89483642578125, "logps/rejected": -269.1952209472656, "loss": 0.5388, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.4898607134819031, "rewards/margins": 0.8843740224838257, "rewards/rejected": -0.3945133090019226, "step": 360 }, { "epoch": 0.77, "grad_norm": 44.15468601338237, "learning_rate": 7.387025063449081e-08, "logits/chosen": -2.409170150756836, "logits/rejected": -2.367518186569214, "logps/chosen": -266.35430908203125, "logps/rejected": -242.5480194091797, "loss": 0.5384, "rewards/accuracies": 0.6875, "rewards/chosen": 0.44893550872802734, "rewards/margins": 0.6646324992179871, "rewards/rejected": -0.21569697558879852, "step": 370 }, { "epoch": 0.79, "grad_norm": 39.47383320898196, "learning_rate": 6.138919252022435e-08, "logits/chosen": -2.3523006439208984, "logits/rejected": -2.3420968055725098, "logps/chosen": -230.9795379638672, "logps/rejected": -267.8912658691406, "loss": 0.5181, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.4252557158470154, "rewards/margins": 0.8839966058731079, "rewards/rejected": -0.45874080061912537, "step": 380 }, { "epoch": 0.82, "grad_norm": 48.64961363299689, "learning_rate": 4.991445467064689e-08, "logits/chosen": -2.4286305904388428, "logits/rejected": -2.394604206085205, "logps/chosen": -293.20440673828125, "logps/rejected": -287.0997009277344, "loss": 0.5149, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.6216251850128174, "rewards/margins": 0.7780872583389282, "rewards/rejected": -0.15646204352378845, "step": 390 }, { "epoch": 0.84, "grad_norm": 44.38080817079193, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -2.4102301597595215, "logits/rejected": -2.3357295989990234, "logps/chosen": -259.7147521972656, "logps/rejected": -273.10699462890625, "loss": 0.518, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.541024386882782, "rewards/margins": 0.8457515835762024, "rewards/rejected": -0.30472710728645325, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": -2.4731171131134033, "eval_logits/rejected": -2.4323782920837402, "eval_logps/chosen": -250.02178955078125, "eval_logps/rejected": -267.0915222167969, "eval_loss": 0.5348160862922668, "eval_rewards/accuracies": 0.75390625, "eval_rewards/chosen": 0.6285843849182129, "eval_rewards/margins": 0.8498014211654663, "eval_rewards/rejected": -0.22121697664260864, "eval_runtime": 96.4764, "eval_samples_per_second": 20.73, "eval_steps_per_second": 0.332, "step": 400 }, { "epoch": 0.86, "grad_norm": 48.15981350653104, "learning_rate": 3.022313472693447e-08, "logits/chosen": -2.444577932357788, "logits/rejected": -2.3699073791503906, "logps/chosen": -286.5138854980469, "logps/rejected": -274.3666687011719, "loss": 0.5226, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.6201778650283813, "rewards/margins": 0.8976529240608215, "rewards/rejected": -0.2774750590324402, "step": 410 }, { "epoch": 0.88, "grad_norm": 48.573506313099124, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -2.429912805557251, "logits/rejected": -2.3931796550750732, "logps/chosen": -287.13067626953125, "logps/rejected": -279.46844482421875, "loss": 0.5212, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4971562325954437, "rewards/margins": 0.7474662065505981, "rewards/rejected": -0.25030994415283203, "step": 420 }, { "epoch": 0.9, "grad_norm": 41.98038749926915, "learning_rate": 1.521597710086439e-08, "logits/chosen": -2.3573684692382812, "logits/rejected": -2.3092567920684814, "logps/chosen": -269.9436950683594, "logps/rejected": -265.4564514160156, "loss": 0.501, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.45472264289855957, "rewards/margins": 0.838543713092804, "rewards/rejected": -0.38382115960121155, "step": 430 }, { "epoch": 0.92, "grad_norm": 44.22650678163462, "learning_rate": 9.57301420397924e-09, "logits/chosen": -2.4332785606384277, "logits/rejected": -2.3776473999023438, "logps/chosen": -272.65960693359375, "logps/rejected": -271.44329833984375, "loss": 0.5213, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5838757157325745, "rewards/margins": 0.700454831123352, "rewards/rejected": -0.1165790781378746, "step": 440 }, { "epoch": 0.94, "grad_norm": 43.00589727019739, "learning_rate": 5.212833302556258e-09, "logits/chosen": -2.3836779594421387, "logits/rejected": -2.360665798187256, "logps/chosen": -284.2134704589844, "logps/rejected": -312.9830627441406, "loss": 0.5099, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.4735330641269684, "rewards/margins": 0.7689631581306458, "rewards/rejected": -0.29543009400367737, "step": 450 }, { "epoch": 0.96, "grad_norm": 46.86754726240038, "learning_rate": 2.158697848236607e-09, "logits/chosen": -2.417273998260498, "logits/rejected": -2.377349376678467, "logps/chosen": -262.1804504394531, "logps/rejected": -247.7431182861328, "loss": 0.5264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.48920711874961853, "rewards/margins": 0.7419728636741638, "rewards/rejected": -0.2527657151222229, "step": 460 }, { "epoch": 0.98, "grad_norm": 45.055740606082026, "learning_rate": 4.269029751107489e-10, "logits/chosen": -2.4338390827178955, "logits/rejected": -2.3758208751678467, "logps/chosen": -268.4836730957031, "logps/rejected": -289.60205078125, "loss": 0.4974, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.5107508897781372, "rewards/margins": 0.8732994794845581, "rewards/rejected": -0.3625485301017761, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.5478911828795238, "train_runtime": 7553.9268, "train_samples_per_second": 8.093, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }