{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 43894.48099242753, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -1.689455509185791, "logits/rejected": -1.4794573783874512, "logps/chosen": -126.21005249023438, "logps/rejected": -98.13133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 32305.118552441847, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -1.7068803310394287, "logits/rejected": -1.6096948385238647, "logps/chosen": -139.68423461914062, "logps/rejected": -91.41385650634766, "loss": 2.648, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": 0.3865443468093872, "rewards/margins": 1.56412935256958, "rewards/rejected": -1.1775851249694824, "step": 10 }, { "epoch": 0.04, "grad_norm": 12815.76079839475, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.6411230564117432, "logits/rejected": -1.6499197483062744, "logps/chosen": -131.1981964111328, "logps/rejected": -93.75257110595703, "loss": 0.8229, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 14.764315605163574, "rewards/margins": 19.453596115112305, "rewards/rejected": -4.68928337097168, "step": 20 }, { "epoch": 0.06, "grad_norm": 8663.076785137986, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.7121353149414062, "logits/rejected": -1.6375898122787476, "logps/chosen": -133.71029663085938, "logps/rejected": -103.07096099853516, "loss": 0.5133, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 44.873260498046875, "rewards/margins": 58.086036682128906, "rewards/rejected": -13.2127685546875, "step": 30 }, { "epoch": 0.08, "grad_norm": 9873.577427815002, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.6784517765045166, "logits/rejected": -1.5826914310455322, "logps/chosen": -145.05630493164062, "logps/rejected": -101.44771575927734, "loss": 0.516, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 66.6468734741211, "rewards/margins": 95.41236114501953, "rewards/rejected": -28.765483856201172, "step": 40 }, { "epoch": 0.1, "grad_norm": 15484.454406367853, "learning_rate": 4.999733114418725e-07, "logits/chosen": -1.6393781900405884, "logits/rejected": -1.6535584926605225, "logps/chosen": -127.37105560302734, "logps/rejected": -104.55952453613281, "loss": 0.6497, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 77.03364562988281, "rewards/margins": 131.46775817871094, "rewards/rejected": -54.434104919433594, "step": 50 }, { "epoch": 0.13, "grad_norm": 4641.588943610254, "learning_rate": 4.990398100856366e-07, "logits/chosen": -1.7217296361923218, "logits/rejected": -1.651254653930664, "logps/chosen": -141.35108947753906, "logps/rejected": -108.5528793334961, "loss": 0.7574, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 96.49129486083984, "rewards/margins": 164.4349822998047, "rewards/rejected": -67.94366455078125, "step": 60 }, { "epoch": 0.15, "grad_norm": 10223.596716214304, "learning_rate": 4.967775735898179e-07, "logits/chosen": -1.6282870769500732, "logits/rejected": -1.6370842456817627, "logps/chosen": -134.8829803466797, "logps/rejected": -106.41259765625, "loss": 1.0394, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 102.15782928466797, "rewards/margins": 186.63003540039062, "rewards/rejected": -84.47221374511719, "step": 70 }, { "epoch": 0.17, "grad_norm": 6360.665892121058, "learning_rate": 4.931986719649298e-07, "logits/chosen": -1.5932952165603638, "logits/rejected": -1.5602772235870361, "logps/chosen": -130.39671325683594, "logps/rejected": -101.85746002197266, "loss": 1.1921, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 100.63119506835938, "rewards/margins": 181.93630981445312, "rewards/rejected": -81.30511474609375, "step": 80 }, { "epoch": 0.19, "grad_norm": 12014.555301109034, "learning_rate": 4.883222001996351e-07, "logits/chosen": -1.6406339406967163, "logits/rejected": -1.6412605047225952, "logps/chosen": -138.44619750976562, "logps/rejected": -112.9968032836914, "loss": 0.9751, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 107.64051818847656, "rewards/margins": 218.6805419921875, "rewards/rejected": -111.0400390625, "step": 90 }, { "epoch": 0.21, "grad_norm": 11340.576903436586, "learning_rate": 4.821741763807186e-07, "logits/chosen": -1.6779956817626953, "logits/rejected": -1.6324456930160522, "logps/chosen": -118.78487396240234, "logps/rejected": -101.80384826660156, "loss": 1.1504, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 91.92900848388672, "rewards/margins": 203.4224090576172, "rewards/rejected": -111.493408203125, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -1.724159836769104, "eval_logits/rejected": -1.6941893100738525, "eval_logps/chosen": -127.37677764892578, "eval_logps/rejected": -104.87450408935547, "eval_loss": 0.9783788323402405, "eval_rewards/accuracies": 0.91015625, "eval_rewards/chosen": 103.95471954345703, "eval_rewards/margins": 220.29249572753906, "eval_rewards/rejected": -116.33775329589844, "eval_runtime": 97.7821, "eval_samples_per_second": 20.454, "eval_steps_per_second": 0.327, "step": 100 }, { "epoch": 0.23, "grad_norm": 9309.434410308813, "learning_rate": 4.747874028753375e-07, "logits/chosen": -1.6036758422851562, "logits/rejected": -1.651767373085022, "logps/chosen": -123.1724624633789, "logps/rejected": -111.15580749511719, "loss": 1.3815, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 88.74246978759766, "rewards/margins": 178.5576934814453, "rewards/rejected": -89.81523132324219, "step": 110 }, { "epoch": 0.25, "grad_norm": 7646.143632789072, "learning_rate": 4.662012913161997e-07, "logits/chosen": -1.6596767902374268, "logits/rejected": -1.639947533607483, "logps/chosen": -122.2258071899414, "logps/rejected": -112.39066314697266, "loss": 1.2948, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 106.12471008300781, "rewards/margins": 216.6038818359375, "rewards/rejected": -110.47917175292969, "step": 120 }, { "epoch": 0.27, "grad_norm": 5056.012586834783, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -1.6654844284057617, "logits/rejected": -1.6651356220245361, "logps/chosen": -129.16343688964844, "logps/rejected": -105.808837890625, "loss": 1.5047, "rewards/accuracies": 0.90625, "rewards/chosen": 115.51092529296875, "rewards/margins": 240.7315673828125, "rewards/rejected": -125.22064208984375, "step": 130 }, { "epoch": 0.29, "grad_norm": 13712.265823185711, "learning_rate": 4.456204510851956e-07, "logits/chosen": -1.5624234676361084, "logits/rejected": -1.5188586711883545, "logps/chosen": -126.886474609375, "logps/rejected": -106.10212707519531, "loss": 2.4129, "rewards/accuracies": 0.875, "rewards/chosen": 106.82698822021484, "rewards/margins": 237.18234252929688, "rewards/rejected": -130.3553924560547, "step": 140 }, { "epoch": 0.31, "grad_norm": 11133.14391408868, "learning_rate": 4.337355301007335e-07, "logits/chosen": -1.688401460647583, "logits/rejected": -1.706859827041626, "logps/chosen": -124.13720703125, "logps/rejected": -103.48664855957031, "loss": 1.5451, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 115.81596374511719, "rewards/margins": 246.532958984375, "rewards/rejected": -130.7169647216797, "step": 150 }, { "epoch": 0.33, "grad_norm": 13437.263938948909, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -1.5865412950515747, "logits/rejected": -1.5408028364181519, "logps/chosen": -131.3970489501953, "logps/rejected": -111.11344909667969, "loss": 1.9447, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 105.68785095214844, "rewards/margins": 227.367919921875, "rewards/rejected": -121.6800308227539, "step": 160 }, { "epoch": 0.36, "grad_norm": 12185.877768787304, "learning_rate": 4.070934040463998e-07, "logits/chosen": -1.8080990314483643, "logits/rejected": -1.7689082622528076, "logps/chosen": -127.97332763671875, "logps/rejected": -110.8963623046875, "loss": 2.3084, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 111.68205261230469, "rewards/margins": 263.9295349121094, "rewards/rejected": -152.2474822998047, "step": 170 }, { "epoch": 0.38, "grad_norm": 12397.612395894674, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -1.7653987407684326, "logits/rejected": -1.7487728595733643, "logps/chosen": -130.79266357421875, "logps/rejected": -106.92414855957031, "loss": 2.1426, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 123.60539245605469, "rewards/margins": 284.14984130859375, "rewards/rejected": -160.54443359375, "step": 180 }, { "epoch": 0.4, "grad_norm": 8233.63739133568, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -1.803746223449707, "logits/rejected": -1.7734184265136719, "logps/chosen": -120.31190490722656, "logps/rejected": -120.1562271118164, "loss": 1.8983, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 114.87654876708984, "rewards/margins": 295.48638916015625, "rewards/rejected": -180.6098175048828, "step": 190 }, { "epoch": 0.42, "grad_norm": 8854.142242075086, "learning_rate": 3.610497133404795e-07, "logits/chosen": -1.748956322669983, "logits/rejected": -1.7438066005706787, "logps/chosen": -126.25050354003906, "logps/rejected": -105.51225280761719, "loss": 2.8553, "rewards/accuracies": 0.9375, "rewards/chosen": 128.85885620117188, "rewards/margins": 303.90032958984375, "rewards/rejected": -175.04144287109375, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -1.8518784046173096, "eval_logits/rejected": -1.8462214469909668, "eval_logps/chosen": -125.94612121582031, "eval_logps/rejected": -111.28173828125, "eval_loss": 1.8849064111709595, "eval_rewards/accuracies": 0.91015625, "eval_rewards/chosen": 118.2613525390625, "eval_rewards/margins": 298.67144775390625, "eval_rewards/rejected": -180.41009521484375, "eval_runtime": 97.5342, "eval_samples_per_second": 20.506, "eval_steps_per_second": 0.328, "step": 200 }, { "epoch": 0.44, "grad_norm": 11055.40272221904, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -1.6907581090927124, "logits/rejected": -1.6238548755645752, "logps/chosen": -122.90483093261719, "logps/rejected": -103.57493591308594, "loss": 2.2063, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 96.85926818847656, "rewards/margins": 236.5902862548828, "rewards/rejected": -139.7310028076172, "step": 210 }, { "epoch": 0.46, "grad_norm": 8772.23523058132, "learning_rate": 3.272542485937368e-07, "logits/chosen": -1.832867980003357, "logits/rejected": -1.8748031854629517, "logps/chosen": -124.8982162475586, "logps/rejected": -103.1186752319336, "loss": 2.1666, "rewards/accuracies": 0.90625, "rewards/chosen": 115.77070617675781, "rewards/margins": 271.67108154296875, "rewards/rejected": -155.9003448486328, "step": 220 }, { "epoch": 0.48, "grad_norm": 7569.137122040314, "learning_rate": 3.096924887558854e-07, "logits/chosen": -1.7989298105239868, "logits/rejected": -1.737357497215271, "logps/chosen": -135.94210815429688, "logps/rejected": -119.22425842285156, "loss": 2.7122, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 117.66719818115234, "rewards/margins": 311.7781677246094, "rewards/rejected": -194.11097717285156, "step": 230 }, { "epoch": 0.5, "grad_norm": 9108.756414029493, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -1.831538438796997, "logits/rejected": -1.8362337350845337, "logps/chosen": -121.7720947265625, "logps/rejected": -112.72883605957031, "loss": 2.3272, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 101.53041076660156, "rewards/margins": 252.52392578125, "rewards/rejected": -150.99354553222656, "step": 240 }, { "epoch": 0.52, "grad_norm": 11190.763278546787, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -1.83551824092865, "logits/rejected": -1.8649381399154663, "logps/chosen": -125.136474609375, "logps/rejected": -114.23868560791016, "loss": 2.3585, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 122.786865234375, "rewards/margins": 295.3399658203125, "rewards/rejected": -172.55311584472656, "step": 250 }, { "epoch": 0.54, "grad_norm": 6158.166356558157, "learning_rate": 2.55479083351317e-07, "logits/chosen": -1.8736432790756226, "logits/rejected": -1.887500524520874, "logps/chosen": -129.7669677734375, "logps/rejected": -101.11165618896484, "loss": 2.615, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 140.60031127929688, "rewards/margins": 302.95391845703125, "rewards/rejected": -162.3535919189453, "step": 260 }, { "epoch": 0.56, "grad_norm": 5058.294410517059, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -1.8732073307037354, "logits/rejected": -1.817939043045044, "logps/chosen": -120.4271011352539, "logps/rejected": -108.13395690917969, "loss": 2.1432, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 116.31379699707031, "rewards/margins": 285.315185546875, "rewards/rejected": -169.00140380859375, "step": 270 }, { "epoch": 0.59, "grad_norm": 8591.417699065232, "learning_rate": 2.19029145890313e-07, "logits/chosen": -1.7311344146728516, "logits/rejected": -1.812242865562439, "logps/chosen": -126.07157897949219, "logps/rejected": -120.28532409667969, "loss": 3.1754, "rewards/accuracies": 0.90625, "rewards/chosen": 109.85076904296875, "rewards/margins": 299.208740234375, "rewards/rejected": -189.35797119140625, "step": 280 }, { "epoch": 0.61, "grad_norm": 10809.88836686454, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -1.8156871795654297, "logits/rejected": -1.7812505960464478, "logps/chosen": -116.09767150878906, "logps/rejected": -107.4920883178711, "loss": 2.8029, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 120.6517333984375, "rewards/margins": 304.40826416015625, "rewards/rejected": -183.7565460205078, "step": 290 }, { "epoch": 0.63, "grad_norm": 7050.7886997426285, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -1.7682945728302002, "logits/rejected": -1.7057702541351318, "logps/chosen": -134.6074676513672, "logps/rejected": -124.12396240234375, "loss": 2.2897, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 118.32698059082031, "rewards/margins": 297.16571044921875, "rewards/rejected": -178.83876037597656, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": -1.868285059928894, "eval_logits/rejected": -1.8641510009765625, "eval_logps/chosen": -124.9817886352539, "eval_logps/rejected": -112.93190002441406, "eval_loss": 2.1029016971588135, "eval_rewards/accuracies": 0.9140625, "eval_rewards/chosen": 127.90460968017578, "eval_rewards/margins": 324.8162536621094, "eval_rewards/rejected": -196.91163635253906, "eval_runtime": 97.6004, "eval_samples_per_second": 20.492, "eval_steps_per_second": 0.328, "step": 300 }, { "epoch": 0.65, "grad_norm": 11804.0120903992, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -1.7608633041381836, "logits/rejected": -1.7920173406600952, "logps/chosen": -126.25215911865234, "logps/rejected": -115.8753433227539, "loss": 2.7724, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 130.85523986816406, "rewards/margins": 304.2178039550781, "rewards/rejected": -173.3625946044922, "step": 310 }, { "epoch": 0.67, "grad_norm": 10459.35107314203, "learning_rate": 1.488723393865766e-07, "logits/chosen": -1.7917076349258423, "logits/rejected": -1.7515103816986084, "logps/chosen": -116.50152587890625, "logps/rejected": -114.2782211303711, "loss": 2.4757, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 142.95352172851562, "rewards/margins": 323.181884765625, "rewards/rejected": -180.22837829589844, "step": 320 }, { "epoch": 0.69, "grad_norm": 8336.1056366251, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -1.7024962902069092, "logits/rejected": -1.7793302536010742, "logps/chosen": -126.26700592041016, "logps/rejected": -110.26517486572266, "loss": 2.626, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 119.15933990478516, "rewards/margins": 284.33038330078125, "rewards/rejected": -165.17105102539062, "step": 330 }, { "epoch": 0.71, "grad_norm": 6521.040226026619, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -1.6726289987564087, "logits/rejected": -1.7343635559082031, "logps/chosen": -120.09101867675781, "logps/rejected": -102.69850158691406, "loss": 3.9728, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 103.992919921875, "rewards/margins": 254.6125030517578, "rewards/rejected": -150.61959838867188, "step": 340 }, { "epoch": 0.73, "grad_norm": 9653.282761957253, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -1.7162805795669556, "logits/rejected": -1.7199398279190063, "logps/chosen": -120.1608657836914, "logps/rejected": -121.0823974609375, "loss": 2.2998, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 99.24105072021484, "rewards/margins": 294.47723388671875, "rewards/rejected": -195.23617553710938, "step": 350 }, { "epoch": 0.75, "grad_norm": 8483.170816558306, "learning_rate": 8.729103716819111e-08, "logits/chosen": -1.7057559490203857, "logits/rejected": -1.7162882089614868, "logps/chosen": -126.68985748291016, "logps/rejected": -117.80401611328125, "loss": 1.8659, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 114.60487365722656, "rewards/margins": 299.46978759765625, "rewards/rejected": -184.86489868164062, "step": 360 }, { "epoch": 0.77, "grad_norm": 16292.765448877528, "learning_rate": 7.387025063449081e-08, "logits/chosen": -1.8123562335968018, "logits/rejected": -1.7508220672607422, "logps/chosen": -116.35862731933594, "logps/rejected": -107.92704010009766, "loss": 2.4835, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 100.31108093261719, "rewards/margins": 254.4406280517578, "rewards/rejected": -154.12954711914062, "step": 370 }, { "epoch": 0.79, "grad_norm": 12994.314264805473, "learning_rate": 6.138919252022435e-08, "logits/chosen": -1.84799063205719, "logits/rejected": -1.857129693031311, "logps/chosen": -124.62117004394531, "logps/rejected": -118.90538024902344, "loss": 2.1362, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 120.3254623413086, "rewards/margins": 317.2894592285156, "rewards/rejected": -196.96397399902344, "step": 380 }, { "epoch": 0.82, "grad_norm": 11506.166124614792, "learning_rate": 4.991445467064689e-08, "logits/chosen": -1.7219364643096924, "logits/rejected": -1.7155206203460693, "logps/chosen": -119.84449768066406, "logps/rejected": -109.42668151855469, "loss": 2.056, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 126.5132827758789, "rewards/margins": 317.73858642578125, "rewards/rejected": -191.22531127929688, "step": 390 }, { "epoch": 0.84, "grad_norm": 14299.186863928613, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -1.7028141021728516, "logits/rejected": -1.7706788778305054, "logps/chosen": -122.94708251953125, "logps/rejected": -110.153564453125, "loss": 2.2714, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 98.14391326904297, "rewards/margins": 283.8785400390625, "rewards/rejected": -185.73460388183594, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": -1.8724907636642456, "eval_logits/rejected": -1.8708370923995972, "eval_logps/chosen": -125.25865936279297, "eval_logps/rejected": -113.72941589355469, "eval_loss": 1.8651787042617798, "eval_rewards/accuracies": 0.9140625, "eval_rewards/chosen": 125.13589477539062, "eval_rewards/margins": 330.022705078125, "eval_rewards/rejected": -204.8868408203125, "eval_runtime": 97.6945, "eval_samples_per_second": 20.472, "eval_steps_per_second": 0.328, "step": 400 }, { "epoch": 0.86, "grad_norm": 9838.615615925528, "learning_rate": 3.022313472693447e-08, "logits/chosen": -1.7946465015411377, "logits/rejected": -1.8112504482269287, "logps/chosen": -134.29222106933594, "logps/rejected": -112.4157943725586, "loss": 2.3885, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 130.2279815673828, "rewards/margins": 333.3272705078125, "rewards/rejected": -203.0992889404297, "step": 410 }, { "epoch": 0.88, "grad_norm": 9593.489304491115, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -1.7704830169677734, "logits/rejected": -1.758772611618042, "logps/chosen": -118.04278564453125, "logps/rejected": -109.47645568847656, "loss": 1.872, "rewards/accuracies": 0.90625, "rewards/chosen": 102.66886138916016, "rewards/margins": 290.7878112792969, "rewards/rejected": -188.11898803710938, "step": 420 }, { "epoch": 0.9, "grad_norm": 6763.365413342396, "learning_rate": 1.521597710086439e-08, "logits/chosen": -1.710999846458435, "logits/rejected": -1.7392107248306274, "logps/chosen": -134.11866760253906, "logps/rejected": -109.66754150390625, "loss": 2.0497, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 119.90589904785156, "rewards/margins": 281.7261657714844, "rewards/rejected": -161.82025146484375, "step": 430 }, { "epoch": 0.92, "grad_norm": 8451.951191648946, "learning_rate": 9.57301420397924e-09, "logits/chosen": -1.852447748184204, "logits/rejected": -1.8499510288238525, "logps/chosen": -125.65571594238281, "logps/rejected": -113.3338394165039, "loss": 2.2476, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 133.1741485595703, "rewards/margins": 332.48126220703125, "rewards/rejected": -199.3070831298828, "step": 440 }, { "epoch": 0.94, "grad_norm": 9580.867576787692, "learning_rate": 5.212833302556258e-09, "logits/chosen": -1.8617397546768188, "logits/rejected": -1.8191306591033936, "logps/chosen": -121.5869369506836, "logps/rejected": -108.55745697021484, "loss": 2.5789, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 137.1997528076172, "rewards/margins": 339.0559387207031, "rewards/rejected": -201.85618591308594, "step": 450 }, { "epoch": 0.96, "grad_norm": 4611.3001759975, "learning_rate": 2.158697848236607e-09, "logits/chosen": -1.7538158893585205, "logits/rejected": -1.7651903629302979, "logps/chosen": -126.25445556640625, "logps/rejected": -113.3001937866211, "loss": 1.6224, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 130.99368286132812, "rewards/margins": 307.7095642089844, "rewards/rejected": -176.7158660888672, "step": 460 }, { "epoch": 0.98, "grad_norm": 8182.843310129901, "learning_rate": 4.269029751107489e-10, "logits/chosen": -1.728009581565857, "logits/rejected": -1.7379541397094727, "logps/chosen": -119.18067932128906, "logps/rejected": -124.09730529785156, "loss": 2.184, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 105.3095703125, "rewards/margins": 308.53289794921875, "rewards/rejected": -203.22329711914062, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 2.0128297995323914, "train_runtime": 7588.5519, "train_samples_per_second": 8.056, "train_steps_per_second": 0.063 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }