{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.59375, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.1666858196258545, "logits/rejected": -2.182244300842285, "logps/chosen": -12.368609428405762, "logps/rejected": -24.687644958496094, "loss": 0.6931, "pred_label": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "use_label": 10.0 }, { "epoch": 0.02, "grad_norm": 0.6796875, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.2281553745269775, "logits/rejected": -2.276446580886841, "logps/chosen": -57.036190032958984, "logps/rejected": -66.88007354736328, "loss": 0.6927, "pred_label": 0.0, "rewards/accuracies": 0.24013157188892365, "rewards/chosen": 0.003924594726413488, "rewards/margins": 0.0009102027979679406, "rewards/rejected": 0.0030143915209919214, "step": 20, "use_label": 170.0 }, { "epoch": 0.04, "grad_norm": 0.6328125, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.2738099098205566, "logits/rejected": -2.2623789310455322, "logps/chosen": -54.78137969970703, "logps/rejected": -67.2437515258789, "loss": 0.6914, "pred_label": 0.0, "rewards/accuracies": 0.24687500298023224, "rewards/chosen": 0.01747792772948742, "rewards/margins": 0.001674558618105948, "rewards/rejected": 0.015803368762135506, "step": 40, "use_label": 482.0 }, { "epoch": 0.06, "grad_norm": 0.71875, "learning_rate": 3.125e-06, "logits/chosen": -2.3237431049346924, "logits/rejected": -2.321906089782715, "logps/chosen": -75.5770034790039, "logps/rejected": -87.68544006347656, "loss": 0.6885, "pred_label": 0.0, "rewards/accuracies": 0.3125, "rewards/chosen": 0.031676117330789566, "rewards/margins": 0.009719676338136196, "rewards/rejected": 0.021956440061330795, "step": 60, "use_label": 802.0 }, { "epoch": 0.08, "grad_norm": 0.73828125, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.2948005199432373, "logits/rejected": -2.2623462677001953, "logps/chosen": -79.29240417480469, "logps/rejected": -83.04844665527344, "loss": 0.6876, "pred_label": 5.800000190734863, "rewards/accuracies": 0.3343749940395355, "rewards/chosen": 0.016009245067834854, "rewards/margins": 0.018887853249907494, "rewards/rejected": -0.0028786074835807085, "step": 80, "use_label": 1116.199951171875 }, { "epoch": 0.1, "grad_norm": 0.6953125, "learning_rate": 4.9997324926814375e-06, "logits/chosen": -2.2056884765625, "logits/rejected": -2.210036039352417, "logps/chosen": -68.87937927246094, "logps/rejected": -77.87590026855469, "loss": 0.6876, "pred_label": 27.537500381469727, "rewards/accuracies": 0.34062498807907104, "rewards/chosen": -0.010471501387655735, "rewards/margins": 0.03584115579724312, "rewards/rejected": -0.04631265625357628, "step": 100, "use_label": 1414.4625244140625 }, { "epoch": 0.1, "eval_logits/chosen": -2.1076083183288574, "eval_logits/rejected": -2.0761499404907227, "eval_logps/chosen": -74.44951629638672, "eval_logps/rejected": -85.2883071899414, "eval_loss": 0.6895647048950195, "eval_pred_label": 89.14286041259766, "eval_rewards/accuracies": 0.335317462682724, "eval_rewards/chosen": -0.05548960343003273, "eval_rewards/margins": 0.04341282695531845, "eval_rewards/rejected": -0.09890241920948029, "eval_runtime": 247.5952, "eval_samples_per_second": 8.078, "eval_steps_per_second": 0.254, "eval_use_label": 1766.857177734375, "step": 100 }, { "epoch": 0.13, "grad_norm": 0.7578125, "learning_rate": 4.9903757462135984e-06, "logits/chosen": -2.2542896270751953, "logits/rejected": -2.1902401447296143, "logps/chosen": -70.2941665649414, "logps/rejected": -84.7874755859375, "loss": 0.6884, "pred_label": 155.6374969482422, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.023759985342621803, "rewards/margins": 0.051492441445589066, "rewards/rejected": -0.07525241374969482, "step": 120, "use_label": 2110.362548828125 }, { "epoch": 0.15, "grad_norm": 0.55859375, "learning_rate": 4.967700826904229e-06, "logits/chosen": -2.1823272705078125, "logits/rejected": -2.210157632827759, "logps/chosen": -61.80498504638672, "logps/rejected": -76.43424224853516, "loss": 0.6907, "pred_label": 204.22500610351562, "rewards/accuracies": 0.26875001192092896, "rewards/chosen": -0.029314354062080383, "rewards/margins": 0.036702848970890045, "rewards/rejected": -0.06601719558238983, "step": 140, "use_label": 2381.77490234375 }, { "epoch": 0.17, "grad_norm": 0.70703125, "learning_rate": 4.931828996974498e-06, "logits/chosen": -2.251568555831909, "logits/rejected": -2.220432996749878, "logps/chosen": -66.60148620605469, "logps/rejected": -71.53702545166016, "loss": 0.69, "pred_label": 257.2124938964844, "rewards/accuracies": 0.3343749940395355, "rewards/chosen": -0.020524730905890465, "rewards/margins": 0.05932433158159256, "rewards/rejected": -0.07984906435012817, "step": 160, "use_label": 2648.78759765625 }, { "epoch": 0.19, "grad_norm": 0.6796875, "learning_rate": 4.882952093833628e-06, "logits/chosen": -2.114015817642212, "logits/rejected": -2.126950740814209, "logps/chosen": -66.40071868896484, "logps/rejected": -78.54503631591797, "loss": 0.6901, "pred_label": 319.9624938964844, "rewards/accuracies": 0.328125, "rewards/chosen": -0.03171534463763237, "rewards/margins": 0.0544399619102478, "rewards/rejected": -0.08615531027317047, "step": 180, "use_label": 2906.03759765625 }, { "epoch": 0.21, "grad_norm": 0.9140625, "learning_rate": 4.821331504159906e-06, "logits/chosen": -2.138213872909546, "logits/rejected": -2.108750343322754, "logps/chosen": -77.92289733886719, "logps/rejected": -78.32075500488281, "loss": 0.6892, "pred_label": 383.5249938964844, "rewards/accuracies": 0.37812501192092896, "rewards/chosen": -0.009543296881020069, "rewards/margins": 0.06037301942706108, "rewards/rejected": -0.06991632282733917, "step": 200, "use_label": 3162.47509765625 }, { "epoch": 0.21, "eval_logits/chosen": -2.051973581314087, "eval_logits/rejected": -2.028658390045166, "eval_logps/chosen": -69.3875503540039, "eval_logps/rejected": -80.99542999267578, "eval_loss": 0.6893584132194519, "eval_pred_label": 459.1111145019531, "eval_rewards/accuracies": 0.3492063581943512, "eval_rewards/chosen": -0.0048699695616960526, "eval_rewards/margins": 0.05110359564423561, "eval_rewards/rejected": -0.05597356706857681, "eval_runtime": 247.8689, "eval_samples_per_second": 8.069, "eval_steps_per_second": 0.254, "eval_use_label": 3500.888916015625, "step": 200 }, { "epoch": 0.23, "grad_norm": 0.765625, "learning_rate": 4.747296766042161e-06, "logits/chosen": -2.172316074371338, "logits/rejected": -2.1599390506744385, "logps/chosen": -73.75865173339844, "logps/rejected": -76.45826721191406, "loss": 0.6906, "pred_label": 537.4000244140625, "rewards/accuracies": 0.34375, "rewards/chosen": -0.017265746369957924, "rewards/margins": 0.061459798365831375, "rewards/rejected": -0.07872554659843445, "step": 220, "use_label": 3832.60009765625 }, { "epoch": 0.25, "grad_norm": 0.671875, "learning_rate": 4.661243806657256e-06, "logits/chosen": -2.1377243995666504, "logits/rejected": -2.114131450653076, "logps/chosen": -78.08522033691406, "logps/rejected": -88.16291809082031, "loss": 0.6906, "pred_label": 610.8624877929688, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.06858871877193451, "rewards/margins": 0.07855252921581268, "rewards/rejected": -0.1471412628889084, "step": 240, "use_label": 4079.137451171875 }, { "epoch": 0.27, "grad_norm": 0.70703125, "learning_rate": 4.563632824908252e-06, "logits/chosen": -2.1762757301330566, "logits/rejected": -2.173243999481201, "logps/chosen": -69.33678436279297, "logps/rejected": -82.98787689208984, "loss": 0.6907, "pred_label": 682.2750244140625, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.06302420794963837, "rewards/margins": 0.0732887014746666, "rewards/rejected": -0.13631291687488556, "step": 260, "use_label": 4327.72509765625 }, { "epoch": 0.29, "grad_norm": 0.625, "learning_rate": 4.454985830346574e-06, "logits/chosen": -2.16465425491333, "logits/rejected": -2.1788923740386963, "logps/chosen": -74.41441345214844, "logps/rejected": -78.55416870117188, "loss": 0.6892, "pred_label": 749.125, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": -0.06083650514483452, "rewards/margins": 0.04520425945520401, "rewards/rejected": -0.10604077577590942, "step": 280, "use_label": 4580.875 }, { "epoch": 0.31, "grad_norm": 0.65234375, "learning_rate": 4.335883851539693e-06, "logits/chosen": -2.0553781986236572, "logits/rejected": -2.0573229789733887, "logps/chosen": -69.96788024902344, "logps/rejected": -80.52223205566406, "loss": 0.6904, "pred_label": 824.5499877929688, "rewards/accuracies": 0.359375, "rewards/chosen": -0.04866168648004532, "rewards/margins": 0.09801270812749863, "rewards/rejected": -0.14667439460754395, "step": 300, "use_label": 4825.4501953125 }, { "epoch": 0.31, "eval_logits/chosen": -2.0163989067077637, "eval_logits/rejected": -1.9942671060562134, "eval_logps/chosen": -75.15243530273438, "eval_logps/rejected": -89.50163269042969, "eval_loss": 0.6908969879150391, "eval_pred_label": 923.3174438476562, "eval_rewards/accuracies": 0.3531745970249176, "eval_rewards/chosen": -0.06251893937587738, "eval_rewards/margins": 0.07851671427488327, "eval_rewards/rejected": -0.14103564620018005, "eval_runtime": 247.8241, "eval_samples_per_second": 8.07, "eval_steps_per_second": 0.254, "eval_use_label": 5140.6826171875, "step": 300 }, { "epoch": 0.33, "grad_norm": 0.9140625, "learning_rate": 4.206963828813555e-06, "logits/chosen": -2.065279483795166, "logits/rejected": -2.0684821605682373, "logps/chosen": -72.58639526367188, "logps/rejected": -89.45655822753906, "loss": 0.6899, "pred_label": 1033.7874755859375, "rewards/accuracies": 0.3125, "rewards/chosen": -0.11120834201574326, "rewards/margins": 0.0645986869931221, "rewards/rejected": -0.17580702900886536, "step": 320, "use_label": 5440.21240234375 }, { "epoch": 0.36, "grad_norm": 0.56640625, "learning_rate": 4.068915207986931e-06, "logits/chosen": -2.033398151397705, "logits/rejected": -1.991502046585083, "logps/chosen": -71.1894760131836, "logps/rejected": -84.0774154663086, "loss": 0.6917, "pred_label": 1122.112548828125, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.07950185984373093, "rewards/margins": 0.08617939054965973, "rewards/rejected": -0.16568127274513245, "step": 340, "use_label": 5671.8876953125 }, { "epoch": 0.38, "grad_norm": 0.84765625, "learning_rate": 3.922476253313921e-06, "logits/chosen": -2.0358688831329346, "logits/rejected": -2.0224781036376953, "logps/chosen": -76.57051849365234, "logps/rejected": -84.2589340209961, "loss": 0.6914, "pred_label": 1204.4124755859375, "rewards/accuracies": 0.31562501192092896, "rewards/chosen": -0.11715561151504517, "rewards/margins": 0.07723374664783478, "rewards/rejected": -0.19438934326171875, "step": 360, "use_label": 5909.58740234375 }, { "epoch": 0.4, "grad_norm": 0.55078125, "learning_rate": 3.768430099352445e-06, "logits/chosen": -2.12782621383667, "logits/rejected": -2.086026430130005, "logps/chosen": -74.41622161865234, "logps/rejected": -85.17180633544922, "loss": 0.6918, "pred_label": 1289.9375, "rewards/accuracies": 0.3656249940395355, "rewards/chosen": -0.07592298835515976, "rewards/margins": 0.08457346260547638, "rewards/rejected": -0.16049645841121674, "step": 380, "use_label": 6144.0625 }, { "epoch": 0.42, "grad_norm": 0.73046875, "learning_rate": 3.607600562872785e-06, "logits/chosen": -2.126784086227417, "logits/rejected": -2.1261298656463623, "logps/chosen": -83.82131958007812, "logps/rejected": -86.00455474853516, "loss": 0.6906, "pred_label": 1373.137451171875, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.05874443054199219, "rewards/margins": 0.06775099784135818, "rewards/rejected": -0.12649545073509216, "step": 400, "use_label": 6380.8623046875 }, { "epoch": 0.42, "eval_logits/chosen": -2.0480618476867676, "eval_logits/rejected": -2.0248324871063232, "eval_logps/chosen": -75.26866149902344, "eval_logps/rejected": -90.80635070800781, "eval_loss": 0.6920759081840515, "eval_pred_label": 1472.5714111328125, "eval_rewards/accuracies": 0.3511904776096344, "eval_rewards/chosen": -0.06368114054203033, "eval_rewards/margins": 0.09040173143148422, "eval_rewards/rejected": -0.15408287942409515, "eval_runtime": 248.0088, "eval_samples_per_second": 8.064, "eval_steps_per_second": 0.254, "eval_use_label": 6695.4287109375, "step": 400 }, { "epoch": 0.44, "grad_norm": 0.78515625, "learning_rate": 3.4408477372034743e-06, "logits/chosen": -2.055358409881592, "logits/rejected": -2.068175792694092, "logps/chosen": -70.47552490234375, "logps/rejected": -79.02010345458984, "loss": 0.6903, "pred_label": 1589.0374755859375, "rewards/accuracies": 0.3656249940395355, "rewards/chosen": -0.06399895995855331, "rewards/margins": 0.0963120311498642, "rewards/rejected": -0.16031098365783691, "step": 420, "use_label": 6988.96240234375 }, { "epoch": 0.46, "grad_norm": 0.95703125, "learning_rate": 3.269063392575352e-06, "logits/chosen": -2.0893940925598145, "logits/rejected": -2.09212589263916, "logps/chosen": -85.68560028076172, "logps/rejected": -87.41291809082031, "loss": 0.6912, "pred_label": 1667.6875, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.13728377223014832, "rewards/margins": 0.07875251770019531, "rewards/rejected": -0.21603628993034363, "step": 440, "use_label": 7230.3125 }, { "epoch": 0.48, "grad_norm": 0.53515625, "learning_rate": 3.09316620706208e-06, "logits/chosen": -2.079465389251709, "logits/rejected": -2.091001033782959, "logps/chosen": -73.67254638671875, "logps/rejected": -81.05415344238281, "loss": 0.6916, "pred_label": 1751.75, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0876312330365181, "rewards/margins": 0.08376732468605042, "rewards/rejected": -0.17139855027198792, "step": 460, "use_label": 7466.25 }, { "epoch": 0.5, "grad_norm": 0.69921875, "learning_rate": 2.91409685362137e-06, "logits/chosen": -2.0379364490509033, "logits/rejected": -2.0492634773254395, "logps/chosen": -77.06828308105469, "logps/rejected": -89.38865661621094, "loss": 0.6912, "pred_label": 1832.6500244140625, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.06041146069765091, "rewards/margins": 0.10216375440359116, "rewards/rejected": -0.16257521510124207, "step": 480, "use_label": 7705.35009765625 }, { "epoch": 0.52, "grad_norm": 0.86328125, "learning_rate": 2.7328129695107205e-06, "logits/chosen": -2.031346082687378, "logits/rejected": -2.0272762775421143, "logps/chosen": -79.55888366699219, "logps/rejected": -84.47586822509766, "loss": 0.6903, "pred_label": 1919.5374755859375, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.08177755773067474, "rewards/margins": 0.08017835766077042, "rewards/rejected": -0.16195592284202576, "step": 500, "use_label": 7938.46240234375 }, { "epoch": 0.52, "eval_logits/chosen": -2.0070507526397705, "eval_logits/rejected": -1.9800992012023926, "eval_logps/chosen": -76.36968231201172, "eval_logps/rejected": -92.65614318847656, "eval_loss": 0.6914148926734924, "eval_pred_label": 2025.793701171875, "eval_rewards/accuracies": 0.3492063581943512, "eval_rewards/chosen": -0.07469133287668228, "eval_rewards/margins": 0.09788943827152252, "eval_rewards/rejected": -0.1725807636976242, "eval_runtime": 247.8554, "eval_samples_per_second": 8.069, "eval_steps_per_second": 0.254, "eval_use_label": 8246.2060546875, "step": 500 }, { "epoch": 0.54, "grad_norm": 0.78125, "learning_rate": 2.5502840349805074e-06, "logits/chosen": -2.026449203491211, "logits/rejected": -2.0701510906219482, "logps/chosen": -75.1209487915039, "logps/rejected": -88.01356506347656, "loss": 0.6913, "pred_label": 2148.887451171875, "rewards/accuracies": 0.3531250059604645, "rewards/chosen": -0.06801941990852356, "rewards/margins": 0.09691040217876434, "rewards/rejected": -0.1649298369884491, "step": 520, "use_label": 8533.1123046875 }, { "epoch": 0.57, "grad_norm": 1.09375, "learning_rate": 2.367486188632446e-06, "logits/chosen": -2.0245327949523926, "logits/rejected": -2.0479135513305664, "logps/chosen": -84.60169219970703, "logps/rejected": -90.6330795288086, "loss": 0.692, "pred_label": 2235.550048828125, "rewards/accuracies": 0.359375, "rewards/chosen": -0.09091995656490326, "rewards/margins": 0.11123095452785492, "rewards/rejected": -0.20215091109275818, "step": 540, "use_label": 8766.4501953125 }, { "epoch": 0.59, "grad_norm": 0.75390625, "learning_rate": 2.1853970071701415e-06, "logits/chosen": -2.0177600383758545, "logits/rejected": -2.016798257827759, "logps/chosen": -78.94650268554688, "logps/rejected": -80.36412811279297, "loss": 0.6917, "pred_label": 2319.53759765625, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": -0.10138510167598724, "rewards/margins": 0.06911652535200119, "rewards/rejected": -0.17050163447856903, "step": 560, "use_label": 9002.462890625 }, { "epoch": 0.61, "grad_norm": 0.71875, "learning_rate": 2.00499027745888e-06, "logits/chosen": -2.054065704345703, "logits/rejected": -2.0555384159088135, "logps/chosen": -80.3529281616211, "logps/rejected": -95.12947082519531, "loss": 0.6919, "pred_label": 2401.675048828125, "rewards/accuracies": 0.359375, "rewards/chosen": -0.09597108513116837, "rewards/margins": 0.09131233394145966, "rewards/rejected": -0.18728342652320862, "step": 580, "use_label": 9240.3251953125 }, { "epoch": 0.63, "grad_norm": 0.76171875, "learning_rate": 1.8272307888529276e-06, "logits/chosen": -2.059126377105713, "logits/rejected": -2.099806547164917, "logps/chosen": -89.58797454833984, "logps/rejected": -108.6166000366211, "loss": 0.6903, "pred_label": 2492.9375, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.12580521404743195, "rewards/margins": 0.10241512209177017, "rewards/rejected": -0.22822031378746033, "step": 600, "use_label": 9469.0625 }, { "epoch": 0.63, "eval_logits/chosen": -1.9870026111602783, "eval_logits/rejected": -1.960112452507019, "eval_logps/chosen": -78.95431518554688, "eval_logps/rejected": -95.86695861816406, "eval_loss": 0.6917396187782288, "eval_pred_label": 2603.9365234375, "eval_rewards/accuracies": 0.3551587164402008, "eval_rewards/chosen": -0.1005377396941185, "eval_rewards/margins": 0.104151152074337, "eval_rewards/rejected": -0.2046888917684555, "eval_runtime": 247.9642, "eval_samples_per_second": 8.066, "eval_steps_per_second": 0.254, "eval_use_label": 9772.0634765625, "step": 600 }, { "epoch": 0.65, "grad_norm": 0.5859375, "learning_rate": 1.6530691736402317e-06, "logits/chosen": -1.9752880334854126, "logits/rejected": -2.011981964111328, "logps/chosen": -69.71615600585938, "logps/rejected": -95.88337707519531, "loss": 0.6918, "pred_label": 2726.324951171875, "rewards/accuracies": 0.34687501192092896, "rewards/chosen": -0.09408678859472275, "rewards/margins": 0.09362435340881348, "rewards/rejected": -0.18771114945411682, "step": 620, "use_label": 10059.6748046875 }, { "epoch": 0.67, "grad_norm": 0.73046875, "learning_rate": 1.4834368231970922e-06, "logits/chosen": -2.0288071632385254, "logits/rejected": -2.0409998893737793, "logps/chosen": -82.56907653808594, "logps/rejected": -90.75765228271484, "loss": 0.6894, "pred_label": 2805.512451171875, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.10210500657558441, "rewards/margins": 0.10695278644561768, "rewards/rejected": -0.2090577781200409, "step": 640, "use_label": 10300.4873046875 }, { "epoch": 0.69, "grad_norm": 0.5625, "learning_rate": 1.3192409070404582e-06, "logits/chosen": -2.055405855178833, "logits/rejected": -2.0071816444396973, "logps/chosen": -77.25361633300781, "logps/rejected": -88.34065246582031, "loss": 0.6915, "pred_label": 2899.9375, "rewards/accuracies": 0.34687501192092896, "rewards/chosen": -0.11595650017261505, "rewards/margins": 0.0952102541923523, "rewards/rejected": -0.21116676926612854, "step": 660, "use_label": 10526.0625 }, { "epoch": 0.71, "grad_norm": 0.67578125, "learning_rate": 1.1613595214152713e-06, "logits/chosen": -2.056795597076416, "logits/rejected": -2.071035861968994, "logps/chosen": -88.15283203125, "logps/rejected": -96.39839172363281, "loss": 0.6918, "pred_label": 2978.0625, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.12273094803094864, "rewards/margins": 0.09404005855321884, "rewards/rejected": -0.2167709767818451, "step": 680, "use_label": 10767.9375 }, { "epoch": 0.73, "grad_norm": 0.74609375, "learning_rate": 1.0106369933615043e-06, "logits/chosen": -2.0782313346862793, "logits/rejected": -2.0467371940612793, "logps/chosen": -97.93621826171875, "logps/rejected": -106.91497802734375, "loss": 0.6917, "pred_label": 3075.71240234375, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.1391007900238037, "rewards/margins": 0.10766571760177612, "rewards/rejected": -0.24676652252674103, "step": 700, "use_label": 10990.287109375 }, { "epoch": 0.73, "eval_logits/chosen": -1.9658821821212769, "eval_logits/rejected": -1.9401167631149292, "eval_logps/chosen": -80.06806182861328, "eval_logps/rejected": -97.64107513427734, "eval_loss": 0.6917343735694885, "eval_pred_label": 3195.22216796875, "eval_rewards/accuracies": 0.3511904776096344, "eval_rewards/chosen": -0.11167524009943008, "eval_rewards/margins": 0.1107548326253891, "eval_rewards/rejected": -0.2224300652742386, "eval_runtime": 247.943, "eval_samples_per_second": 8.066, "eval_steps_per_second": 0.254, "eval_use_label": 11284.77734375, "step": 700 }, { "epoch": 0.75, "grad_norm": 0.72265625, "learning_rate": 8.678793653740633e-07, "logits/chosen": -2.015249729156494, "logits/rejected": -2.0358498096466064, "logps/chosen": -70.9017562866211, "logps/rejected": -86.4397201538086, "loss": 0.6908, "pred_label": 3306.39990234375, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": -0.10931293666362762, "rewards/margins": 0.0925455391407013, "rewards/rejected": -0.20185847580432892, "step": 720, "use_label": 11583.599609375 }, { "epoch": 0.77, "grad_norm": 0.83203125, "learning_rate": 7.338500848029603e-07, "logits/chosen": -2.01334810256958, "logits/rejected": -2.0296788215637207, "logps/chosen": -74.19635772705078, "logps/rejected": -83.99024200439453, "loss": 0.6911, "pred_label": 3386.16259765625, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.08706559240818024, "rewards/margins": 0.11473299562931061, "rewards/rejected": -0.20179858803749084, "step": 740, "use_label": 11823.837890625 }, { "epoch": 0.8, "grad_norm": 0.66015625, "learning_rate": 6.092659210462232e-07, "logits/chosen": -2.052433967590332, "logits/rejected": -2.060997724533081, "logps/chosen": -76.93110656738281, "logps/rejected": -97.30107879638672, "loss": 0.6904, "pred_label": 3466.5, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.11182014644145966, "rewards/margins": 0.07981495559215546, "rewards/rejected": -0.1916351020336151, "step": 760, "use_label": 12063.5 }, { "epoch": 0.82, "grad_norm": 0.859375, "learning_rate": 4.947931323697983e-07, "logits/chosen": -2.032320737838745, "logits/rejected": -2.047227144241333, "logps/chosen": -89.46810913085938, "logps/rejected": -95.58660125732422, "loss": 0.6913, "pred_label": 3558.875, "rewards/accuracies": 0.375, "rewards/chosen": -0.11294672638177872, "rewards/margins": 0.11753211170434952, "rewards/rejected": -0.23047883808612823, "step": 780, "use_label": 12291.125 }, { "epoch": 0.84, "grad_norm": 0.74609375, "learning_rate": 3.910439028537638e-07, "logits/chosen": -2.010045289993286, "logits/rejected": -1.989505410194397, "logps/chosen": -70.47514343261719, "logps/rejected": -75.11082458496094, "loss": 0.6912, "pred_label": 3649.22509765625, "rewards/accuracies": 0.3656249940395355, "rewards/chosen": -0.08034199476242065, "rewards/margins": 0.0995674580335617, "rewards/rejected": -0.17990948259830475, "step": 800, "use_label": 12520.775390625 }, { "epoch": 0.84, "eval_logits/chosen": -1.9421576261520386, "eval_logits/rejected": -1.9144233465194702, "eval_logps/chosen": -77.5874252319336, "eval_logps/rejected": -95.20885467529297, "eval_loss": 0.6917100548744202, "eval_pred_label": 3757.174560546875, "eval_rewards/accuracies": 0.363095223903656, "eval_rewards/chosen": -0.08686873316764832, "eval_rewards/margins": 0.11123905330896378, "eval_rewards/rejected": -0.19810780882835388, "eval_runtime": 247.8932, "eval_samples_per_second": 8.068, "eval_steps_per_second": 0.254, "eval_use_label": 12826.8251953125, "step": 800 }, { "epoch": 0.86, "grad_norm": 0.828125, "learning_rate": 2.98573068519539e-07, "logits/chosen": -2.035728931427002, "logits/rejected": -2.029679775238037, "logps/chosen": -74.97032165527344, "logps/rejected": -84.2763900756836, "loss": 0.6908, "pred_label": 3872.199951171875, "rewards/accuracies": 0.3343749940395355, "rewards/chosen": -0.1004786491394043, "rewards/margins": 0.08142165094614029, "rewards/rejected": -0.181900292634964, "step": 820, "use_label": 13121.7998046875 }, { "epoch": 0.88, "grad_norm": 0.6953125, "learning_rate": 2.178751501463036e-07, "logits/chosen": -2.0276803970336914, "logits/rejected": -2.0149848461151123, "logps/chosen": -66.70552062988281, "logps/rejected": -70.63726806640625, "loss": 0.6915, "pred_label": 3954.60009765625, "rewards/accuracies": 0.28437501192092896, "rewards/chosen": -0.08035041391849518, "rewards/margins": 0.07462439686059952, "rewards/rejected": -0.1549748182296753, "step": 840, "use_label": 13359.400390625 }, { "epoch": 0.9, "grad_norm": 0.7578125, "learning_rate": 1.4938170864468636e-07, "logits/chosen": -2.048083543777466, "logits/rejected": -2.0321922302246094, "logps/chosen": -90.8042221069336, "logps/rejected": -100.8233413696289, "loss": 0.69, "pred_label": 4041.72509765625, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0809466689825058, "rewards/margins": 0.1332779824733734, "rewards/rejected": -0.2142246663570404, "step": 860, "use_label": 13592.275390625 }, { "epoch": 0.92, "grad_norm": 0.5546875, "learning_rate": 9.345903713082305e-08, "logits/chosen": -2.047487735748291, "logits/rejected": -2.034466505050659, "logps/chosen": -81.69231414794922, "logps/rejected": -101.5263442993164, "loss": 0.6915, "pred_label": 4142.625, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -0.09660721570253372, "rewards/margins": 0.13364934921264648, "rewards/rejected": -0.23025652766227722, "step": 880, "use_label": 13811.375 }, { "epoch": 0.94, "grad_norm": 0.7578125, "learning_rate": 5.0406202043228604e-08, "logits/chosen": -1.9304163455963135, "logits/rejected": -1.9657026529312134, "logps/chosen": -75.30284118652344, "logps/rejected": -99.71704864501953, "loss": 0.6914, "pred_label": 4235.9248046875, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.08683101832866669, "rewards/margins": 0.10066400468349457, "rewards/rejected": -0.18749502301216125, "step": 900, "use_label": 14038.0751953125 }, { "epoch": 0.94, "eval_logits/chosen": -1.939072847366333, "eval_logits/rejected": -1.9112603664398193, "eval_logps/chosen": -77.5274658203125, "eval_logps/rejected": -95.22908020019531, "eval_loss": 0.6917905211448669, "eval_pred_label": 4352.28564453125, "eval_rewards/accuracies": 0.3571428656578064, "eval_rewards/chosen": -0.08626923710107803, "eval_rewards/margins": 0.1120409369468689, "eval_rewards/rejected": -0.19831016659736633, "eval_runtime": 247.7794, "eval_samples_per_second": 8.072, "eval_steps_per_second": 0.254, "eval_use_label": 14335.7138671875, "step": 900 }, { "epoch": 0.96, "grad_norm": 0.80078125, "learning_rate": 2.0453443778310766e-08, "logits/chosen": -1.9801095724105835, "logits/rejected": -1.9714418649673462, "logps/chosen": -63.8930778503418, "logps/rejected": -85.15528869628906, "loss": 0.6906, "pred_label": 4473.8125, "rewards/accuracies": 0.31562501192092896, "rewards/chosen": -0.06585933268070221, "rewards/margins": 0.11039040982723236, "rewards/rejected": -0.17624975740909576, "step": 920, "use_label": 14624.1875 }, { "epoch": 0.98, "grad_norm": 0.8359375, "learning_rate": 3.760945397705828e-09, "logits/chosen": -1.9589160680770874, "logits/rejected": -1.9971154928207397, "logps/chosen": -74.0462646484375, "logps/rejected": -91.64708709716797, "loss": 0.6913, "pred_label": 4558.71240234375, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.0799408107995987, "rewards/margins": 0.10116855055093765, "rewards/rejected": -0.18110935389995575, "step": 940, "use_label": 14859.287109375 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 0.6906769273168754, "train_runtime": 20027.4031, "train_samples_per_second": 3.053, "train_steps_per_second": 0.048 } ], "logging_steps": 20, "max_steps": 955, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }