{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 800000000, "global_step": 383, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.6875, "learning_rate": 1.282051282051282e-07, "logits/chosen": -3.3797154426574707, "logits/rejected": -3.440782070159912, "logps/chosen": -244.57943725585938, "logps/rejected": -168.14312744140625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/diff": -0.5416666865348816, "rewards/diff_abs": 0.5416666865348816, "rewards/rejected": 0.0, "rewards/student_margin": 0.0, "rewards/teacher_margin": 0.5416666865348816, "step": 1 }, { "epoch": 0.03, "grad_norm": 1.7421875, "learning_rate": 1.282051282051282e-06, "logits/chosen": -3.3581883907318115, "logits/rejected": -3.306663990020752, "logps/chosen": -323.6011657714844, "logps/rejected": -269.5755615234375, "loss": 0.6946, "rewards/accuracies": 0.48148155212402344, "rewards/chosen": -0.002131123561412096, "rewards/diff": -2.200160026550293, "rewards/diff_abs": 2.200160026550293, "rewards/rejected": -0.0022028146777302027, "rewards/student_margin": 7.169279706431553e-05, "rewards/teacher_margin": 2.2002317905426025, "step": 10 }, { "epoch": 0.05, "grad_norm": 1.6875, "learning_rate": 2.564102564102564e-06, "logits/chosen": -3.5238196849823, "logits/rejected": -3.590470552444458, "logps/chosen": -277.163818359375, "logps/rejected": -192.54022216796875, "loss": 0.6932, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": -0.003379967762157321, "rewards/diff": -2.1003687381744385, "rewards/diff_abs": 2.1011030673980713, "rewards/rejected": -0.007698670029640198, "rewards/student_margin": 0.004318701568990946, "rewards/teacher_margin": 2.1046876907348633, "step": 20 }, { "epoch": 0.08, "grad_norm": 1.6875, "learning_rate": 3.846153846153847e-06, "logits/chosen": -3.4147961139678955, "logits/rejected": -3.542013168334961, "logps/chosen": -301.5926208496094, "logps/rejected": -231.64608764648438, "loss": 0.6909, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.017984800040721893, "rewards/diff": -2.444106101989746, "rewards/diff_abs": 2.444106101989746, "rewards/rejected": 0.006361725740134716, "rewards/student_margin": 0.011623072437942028, "rewards/teacher_margin": 2.4557292461395264, "step": 30 }, { "epoch": 0.1, "grad_norm": 1.640625, "learning_rate": 4.99989574668946e-06, "logits/chosen": -3.4892425537109375, "logits/rejected": -3.546952486038208, "logps/chosen": -249.1132049560547, "logps/rejected": -179.81195068359375, "loss": 0.6874, "rewards/accuracies": 0.36666667461395264, "rewards/chosen": 0.002312846016138792, "rewards/diff": -2.8649048805236816, "rewards/diff_abs": 2.8649048805236816, "rewards/rejected": 0.019301289692521095, "rewards/student_margin": -0.016988443210721016, "rewards/teacher_margin": 2.847916841506958, "step": 40 }, { "epoch": 0.13, "grad_norm": 1.609375, "learning_rate": 4.987395866955716e-06, "logits/chosen": -3.387133836746216, "logits/rejected": -3.533109664916992, "logps/chosen": -331.1688232421875, "logps/rejected": -186.29550170898438, "loss": 0.6808, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 0.03838258236646652, "rewards/diff": -1.9975881576538086, "rewards/diff_abs": 2.001307487487793, "rewards/rejected": -0.008821181952953339, "rewards/student_margin": 0.04720376059412956, "rewards/teacher_margin": 2.0447916984558105, "step": 50 }, { "epoch": 0.16, "grad_norm": 1.6875, "learning_rate": 4.954164717534748e-06, "logits/chosen": -3.346745729446411, "logits/rejected": -3.3796913623809814, "logps/chosen": -327.2835388183594, "logps/rejected": -350.4725036621094, "loss": 0.679, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": 0.01055043376982212, "rewards/diff": -1.2435307502746582, "rewards/diff_abs": 1.2447913885116577, "rewards/rejected": 0.049393653869628906, "rewards/student_margin": -0.038843221962451935, "rewards/teacher_margin": 1.2046875953674316, "step": 60 }, { "epoch": 0.18, "grad_norm": 1.6171875, "learning_rate": 4.900479264361017e-06, "logits/chosen": -3.3996708393096924, "logits/rejected": -3.4439964294433594, "logps/chosen": -308.00958251953125, "logps/rejected": -278.58026123046875, "loss": 0.6708, "rewards/accuracies": 0.5666666626930237, "rewards/chosen": 0.0636972039937973, "rewards/diff": -1.3036770820617676, "rewards/diff_abs": 1.306718349456787, "rewards/rejected": 0.04393672198057175, "rewards/student_margin": 0.019760485738515854, "rewards/teacher_margin": 1.3234374523162842, "step": 70 }, { "epoch": 0.21, "grad_norm": 1.6328125, "learning_rate": 4.826786950329646e-06, "logits/chosen": -3.520059108734131, "logits/rejected": -3.576214551925659, "logps/chosen": -283.0509338378906, "logps/rejected": -180.75180053710938, "loss": 0.6653, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 0.07656367868185043, "rewards/diff": -1.3558346033096313, "rewards/diff_abs": 1.3813632726669312, "rewards/rejected": 0.01052325963973999, "rewards/student_margin": 0.06604041904211044, "rewards/teacher_margin": 1.421875, "step": 80 }, { "epoch": 0.23, "grad_norm": 1.6640625, "learning_rate": 4.733701966071226e-06, "logits/chosen": -3.4589409828186035, "logits/rejected": -3.511751890182495, "logps/chosen": -335.0478820800781, "logps/rejected": -170.95372009277344, "loss": 0.665, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.02129988744854927, "rewards/diff": -2.89522647857666, "rewards/diff_abs": 2.9056954383850098, "rewards/rejected": 0.027463769540190697, "rewards/student_margin": -0.006163885351270437, "rewards/teacher_margin": 2.8890626430511475, "step": 90 }, { "epoch": 0.26, "grad_norm": 1.59375, "learning_rate": 4.622000130963015e-06, "logits/chosen": -3.4993369579315186, "logits/rejected": -3.5635037422180176, "logps/chosen": -305.15899658203125, "logps/rejected": -202.56192016601562, "loss": 0.6583, "rewards/accuracies": 0.5, "rewards/chosen": 0.018651207908988, "rewards/diff": -2.507476329803467, "rewards/diff_abs": 2.507476329803467, "rewards/rejected": -0.02908078208565712, "rewards/student_margin": 0.04773198813199997, "rewards/teacher_margin": 2.555208444595337, "step": 100 }, { "epoch": 0.29, "grad_norm": 1.5546875, "learning_rate": 4.492612427040864e-06, "logits/chosen": -3.5523293018341064, "logits/rejected": -3.6302642822265625, "logps/chosen": -277.225830078125, "logps/rejected": -200.26490783691406, "loss": 0.6502, "rewards/accuracies": 0.5999999642372131, "rewards/chosen": 0.024044061079621315, "rewards/diff": -1.571934461593628, "rewards/diff_abs": 1.571934461593628, "rewards/rejected": -0.08318804949522018, "rewards/student_margin": 0.10723210871219635, "rewards/teacher_margin": 1.6791667938232422, "step": 110 }, { "epoch": 0.31, "grad_norm": 1.578125, "learning_rate": 4.346617239703676e-06, "logits/chosen": -3.480700969696045, "logits/rejected": -3.604377031326294, "logps/chosen": -304.3082580566406, "logps/rejected": -239.17782592773438, "loss": 0.6465, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.10159333795309067, "rewards/diff": -1.6648391485214233, "rewards/diff_abs": 1.7264082431793213, "rewards/rejected": 0.0065366788767278194, "rewards/student_margin": 0.09505666792392731, "rewards/teacher_margin": 1.7598956823349, "step": 120 }, { "epoch": 0.34, "grad_norm": 1.6171875, "learning_rate": 4.185231369880461e-06, "logits/chosen": -3.216306209564209, "logits/rejected": -3.4205565452575684, "logps/chosen": -324.16461181640625, "logps/rejected": -221.4116668701172, "loss": 0.6427, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.020539376884698868, "rewards/diff": -2.5135083198547363, "rewards/diff_abs": 2.52120041847229, "rewards/rejected": -0.060743771493434906, "rewards/student_margin": 0.08128315210342407, "rewards/teacher_margin": 2.594791889190674, "step": 130 }, { "epoch": 0.37, "grad_norm": 1.640625, "learning_rate": 4.009799892569317e-06, "logits/chosen": -3.4796624183654785, "logits/rejected": -3.4889540672302246, "logps/chosen": -294.43646240234375, "logps/rejected": -235.06918334960938, "loss": 0.637, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.09381090849637985, "rewards/diff": -1.9453909397125244, "rewards/diff_abs": 1.9707868099212646, "rewards/rejected": -0.13163167238235474, "rewards/student_margin": 0.225442573428154, "rewards/teacher_margin": 2.1708333492279053, "step": 140 }, { "epoch": 0.39, "grad_norm": 1.546875, "learning_rate": 3.8217849462726334e-06, "logits/chosen": -3.6116485595703125, "logits/rejected": -3.564044237136841, "logps/chosen": -246.2872314453125, "logps/rejected": -221.53182983398438, "loss": 0.6364, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": -0.0010233506327494979, "rewards/diff": -1.8935142755508423, "rewards/diff_abs": 1.8935142755508423, "rewards/rejected": -0.08146756142377853, "rewards/student_margin": 0.08044421672821045, "rewards/teacher_margin": 1.9739586114883423, "step": 150 }, { "epoch": 0.42, "grad_norm": 1.609375, "learning_rate": 3.6227535467632873e-06, "logits/chosen": -3.4925827980041504, "logits/rejected": -3.6650619506835938, "logps/chosen": -441.7289123535156, "logps/rejected": -258.9549255371094, "loss": 0.6285, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09688085317611694, "rewards/diff": -1.887935996055603, "rewards/diff_abs": 1.9241327047348022, "rewards/rejected": -0.09226653724908829, "rewards/student_margin": 0.18914742767810822, "rewards/teacher_margin": 2.0770833492279053, "step": 160 }, { "epoch": 0.44, "grad_norm": 1.609375, "learning_rate": 3.4143645267483144e-06, "logits/chosen": -3.485863208770752, "logits/rejected": -3.5399489402770996, "logps/chosen": -317.8275146484375, "logps/rejected": -262.54498291015625, "loss": 0.6253, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009405359625816345, "rewards/diff": -2.4229490756988525, "rewards/diff_abs": 2.4852664470672607, "rewards/rejected": -0.1302066296339035, "rewards/student_margin": 0.12080129235982895, "rewards/teacher_margin": 2.543750047683716, "step": 170 }, { "epoch": 0.47, "grad_norm": 1.5546875, "learning_rate": 3.1983547102818104e-06, "logits/chosen": -3.4576945304870605, "logits/rejected": -3.5367603302001953, "logps/chosen": -356.4649658203125, "logps/rejected": -292.81439208984375, "loss": 0.6201, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.02529655024409294, "rewards/diff": -1.6506948471069336, "rewards/diff_abs": 1.7114416360855103, "rewards/rejected": -0.43606019020080566, "rewards/student_margin": 0.410763680934906, "rewards/teacher_margin": 2.0614585876464844, "step": 180 }, { "epoch": 0.5, "grad_norm": 1.5625, "learning_rate": 2.9765244371567873e-06, "logits/chosen": -3.4763588905334473, "logits/rejected": -3.562711715698242, "logps/chosen": -280.12835693359375, "logps/rejected": -208.5796661376953, "loss": 0.6191, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.045582111924886703, "rewards/diff": -2.346590280532837, "rewards/diff_abs": 2.3965134620666504, "rewards/rejected": -0.23230692744255066, "rewards/student_margin": 0.27788907289505005, "rewards/teacher_margin": 2.624479055404663, "step": 190 }, { "epoch": 0.52, "grad_norm": 1.6484375, "learning_rate": 2.7507225579233487e-06, "logits/chosen": -3.7000794410705566, "logits/rejected": -3.884822368621826, "logps/chosen": -268.5820617675781, "logps/rejected": -196.71481323242188, "loss": 0.6147, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 0.0688471719622612, "rewards/diff": -2.06192946434021, "rewards/diff_abs": 2.06192946434021, "rewards/rejected": -0.0957857146859169, "rewards/student_margin": 0.1646328866481781, "rewards/teacher_margin": 2.2265625, "step": 200 }, { "epoch": 0.55, "grad_norm": 1.734375, "learning_rate": 2.522831024592615e-06, "logits/chosen": -3.5710883140563965, "logits/rejected": -3.746605634689331, "logps/chosen": -306.7405700683594, "logps/rejected": -241.92324829101562, "loss": 0.6188, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.010941224172711372, "rewards/diff": -2.189682722091675, "rewards/diff_abs": 2.2137255668640137, "rewards/rejected": -0.17959186434745789, "rewards/student_margin": 0.16865065693855286, "rewards/teacher_margin": 2.3583333492279053, "step": 210 }, { "epoch": 0.57, "grad_norm": 1.5859375, "learning_rate": 2.2947492054556075e-06, "logits/chosen": -3.5517051219940186, "logits/rejected": -3.7514452934265137, "logps/chosen": -323.0804138183594, "logps/rejected": -211.4063262939453, "loss": 0.6077, "rewards/accuracies": 0.6333333849906921, "rewards/chosen": -0.08047564327716827, "rewards/diff": -1.5008246898651123, "rewards/diff_abs": 1.5533117055892944, "rewards/rejected": -0.24319279193878174, "rewards/student_margin": 0.16271713376045227, "rewards/teacher_margin": 1.6635417938232422, "step": 220 }, { "epoch": 0.6, "grad_norm": 1.59375, "learning_rate": 2.0683780547456666e-06, "logits/chosen": -3.480419635772705, "logits/rejected": -3.667999744415283, "logps/chosen": -314.51739501953125, "logps/rejected": -293.828125, "loss": 0.6222, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 0.07014875113964081, "rewards/diff": -1.6531155109405518, "rewards/diff_abs": 1.6864144802093506, "rewards/rejected": -0.21058997511863708, "rewards/student_margin": 0.2807387411594391, "rewards/teacher_margin": 1.933854341506958, "step": 230 }, { "epoch": 0.63, "grad_norm": 1.546875, "learning_rate": 1.845604269082787e-06, "logits/chosen": -3.5570831298828125, "logits/rejected": -3.799448013305664, "logps/chosen": -326.3376159667969, "logps/rejected": -229.31527709960938, "loss": 0.6077, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.007120040711015463, "rewards/diff": -2.2296009063720703, "rewards/diff_abs": 2.2296009063720703, "rewards/rejected": -0.10494570434093475, "rewards/student_margin": 0.11206575483083725, "rewards/teacher_margin": 2.3416669368743896, "step": 240 }, { "epoch": 0.65, "grad_norm": 1.578125, "learning_rate": 1.628284562748429e-06, "logits/chosen": -3.538252592086792, "logits/rejected": -3.9445133209228516, "logps/chosen": -453.56585693359375, "logps/rejected": -191.9573516845703, "loss": 0.6036, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.1685246229171753, "rewards/diff": -2.3100411891937256, "rewards/diff_abs": 2.372875690460205, "rewards/rejected": -0.21674680709838867, "rewards/student_margin": 0.38527145981788635, "rewards/teacher_margin": 2.695312738418579, "step": 250 }, { "epoch": 0.68, "grad_norm": 1.5703125, "learning_rate": 1.4182301928489556e-06, "logits/chosen": -3.6895079612731934, "logits/rejected": -3.920624256134033, "logps/chosen": -319.21429443359375, "logps/rejected": -181.65463256835938, "loss": 0.6049, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": 0.05849825218319893, "rewards/diff": -2.5624961853027344, "rewards/diff_abs": 2.587552547454834, "rewards/rejected": -0.21494324505329132, "rewards/student_margin": 0.27344149351119995, "rewards/teacher_margin": 2.835937738418579, "step": 260 }, { "epoch": 0.7, "grad_norm": 1.5078125, "learning_rate": 1.2171918633431623e-06, "logits/chosen": -3.5318374633789062, "logits/rejected": -3.4252419471740723, "logps/chosen": -341.723388671875, "logps/rejected": -344.2121887207031, "loss": 0.6112, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.0180380679666996, "rewards/diff": -1.8148130178451538, "rewards/diff_abs": 1.9221045970916748, "rewards/rejected": -0.31676679849624634, "rewards/student_margin": 0.2987287640571594, "rewards/teacher_margin": 2.113541603088379, "step": 270 }, { "epoch": 0.73, "grad_norm": 1.546875, "learning_rate": 1.0268451337516774e-06, "logits/chosen": -3.6931426525115967, "logits/rejected": -3.896336078643799, "logps/chosen": -308.007080078125, "logps/rejected": -167.8982696533203, "loss": 0.6079, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.07022675126791, "rewards/diff": -2.198145627975464, "rewards/diff_abs": 2.226590394973755, "rewards/rejected": -0.31322699785232544, "rewards/student_margin": 0.24300022423267365, "rewards/teacher_margin": 2.441145896911621, "step": 280 }, { "epoch": 0.76, "grad_norm": 1.5, "learning_rate": 8.487764541597765e-07, "logits/chosen": -3.497781753540039, "logits/rejected": -3.8565516471862793, "logps/chosen": -267.2030944824219, "logps/rejected": -168.0758819580078, "loss": 0.6053, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -0.04164644330739975, "rewards/diff": -2.201241970062256, "rewards/diff_abs": 2.2781121730804443, "rewards/rejected": -0.2831125855445862, "rewards/student_margin": 0.24146613478660583, "rewards/teacher_margin": 2.4427084922790527, "step": 290 }, { "epoch": 0.78, "grad_norm": 1.5625, "learning_rate": 6.844699429052377e-07, "logits/chosen": -3.385387897491455, "logits/rejected": -3.6356453895568848, "logps/chosen": -409.85205078125, "logps/rejected": -312.1107177734375, "loss": 0.6067, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05188798904418945, "rewards/diff": -1.7707669734954834, "rewards/diff_abs": 1.8406012058258057, "rewards/rejected": -0.28216272592544556, "rewards/student_margin": 0.2302747219800949, "rewards/teacher_margin": 2.001041889190674, "step": 300 }, { "epoch": 0.81, "grad_norm": 1.5703125, "learning_rate": 5.352950171529928e-07, "logits/chosen": -3.574982166290283, "logits/rejected": -3.6512961387634277, "logps/chosen": -244.470947265625, "logps/rejected": -193.90447998046875, "loss": 0.6061, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -0.07677438855171204, "rewards/diff": -2.3986549377441406, "rewards/diff_abs": 2.4272804260253906, "rewards/rejected": -0.37082797288894653, "rewards/student_margin": 0.2940535545349121, "rewards/teacher_margin": 2.6927084922790527, "step": 310 }, { "epoch": 0.84, "grad_norm": 1.5703125, "learning_rate": 4.024949794498623e-07, "logits/chosen": -3.5370934009552, "logits/rejected": -3.8875668048858643, "logps/chosen": -254.0207977294922, "logps/rejected": -177.51011657714844, "loss": 0.6019, "rewards/accuracies": 0.73333340883255, "rewards/chosen": -0.05724753811955452, "rewards/diff": -2.3326706886291504, "rewards/diff_abs": 2.4501850605010986, "rewards/rejected": -0.37457695603370667, "rewards/student_margin": 0.31732940673828125, "rewards/teacher_margin": 2.6500000953674316, "step": 320 }, { "epoch": 0.86, "grad_norm": 1.6171875, "learning_rate": 2.8717665538507965e-07, "logits/chosen": -3.554170608520508, "logits/rejected": -3.562087297439575, "logps/chosen": -279.2240905761719, "logps/rejected": -198.07553100585938, "loss": 0.6066, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.03678184002637863, "rewards/diff": -1.2939647436141968, "rewards/diff_abs": 1.5477110147476196, "rewards/rejected": -0.4365670084953308, "rewards/student_margin": 0.39978522062301636, "rewards/teacher_margin": 1.693750023841858, "step": 330 }, { "epoch": 0.89, "grad_norm": 1.546875, "learning_rate": 1.9030116872178317e-07, "logits/chosen": -3.729553699493408, "logits/rejected": -3.6963329315185547, "logps/chosen": -294.4330139160156, "logps/rejected": -217.95785522460938, "loss": 0.6061, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.03005790151655674, "rewards/diff": -1.652592420578003, "rewards/diff_abs": 1.681958794593811, "rewards/rejected": -0.3183913826942444, "rewards/student_margin": 0.34844931960105896, "rewards/teacher_margin": 2.001041889190674, "step": 340 }, { "epoch": 0.91, "grad_norm": 1.5234375, "learning_rate": 1.1267593088441886e-07, "logits/chosen": -3.5918819904327393, "logits/rejected": -3.487471103668213, "logps/chosen": -301.32330322265625, "logps/rejected": -268.36737060546875, "loss": 0.6086, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": -0.02564469538629055, "rewards/diff": -1.5005762577056885, "rewards/diff_abs": 1.6122653484344482, "rewards/rejected": -0.3542352318763733, "rewards/student_margin": 0.3285905420780182, "rewards/teacher_margin": 1.8291666507720947, "step": 350 }, { "epoch": 0.94, "grad_norm": 1.59375, "learning_rate": 5.494791156587686e-08, "logits/chosen": -3.7036406993865967, "logits/rejected": -3.693377733230591, "logps/chosen": -232.45254516601562, "logps/rejected": -235.75521850585938, "loss": 0.6033, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": -0.03294830024242401, "rewards/diff": -1.6850173473358154, "rewards/diff_abs": 1.7215397357940674, "rewards/rejected": -0.19897261261940002, "rewards/student_margin": 0.166024312376976, "rewards/teacher_margin": 1.851041555404663, "step": 360 }, { "epoch": 0.97, "grad_norm": 1.5625, "learning_rate": 1.7598246540683483e-08, "logits/chosen": -3.8011093139648438, "logits/rejected": -3.7970664501190186, "logps/chosen": -246.9715118408203, "logps/rejected": -200.6898956298828, "loss": 0.609, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -0.005081920884549618, "rewards/diff": -1.5501182079315186, "rewards/diff_abs": 1.625860571861267, "rewards/rejected": -0.3940262198448181, "rewards/student_margin": 0.3889443278312683, "rewards/teacher_margin": 1.9390627145767212, "step": 370 }, { "epoch": 0.99, "grad_norm": 1.5625, "learning_rate": 9.382276255742729e-10, "logits/chosen": -3.510223388671875, "logits/rejected": -3.580479145050049, "logps/chosen": -379.2341003417969, "logps/rejected": -310.8593444824219, "loss": 0.6077, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.037930965423583984, "rewards/diff": -1.598463535308838, "rewards/diff_abs": 1.7268564701080322, "rewards/rejected": -0.2682930827140808, "rewards/student_margin": 0.3062240481376648, "rewards/teacher_margin": 1.904687523841858, "step": 380 }, { "epoch": 1.0, "step": 383, "total_flos": 0.0, "train_loss": 0.6328112833182432, "train_runtime": 3006.7888, "train_samples_per_second": 48.89, "train_steps_per_second": 0.127 } ], "logging_steps": 10, "max_steps": 383, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100000000000000000000000000000000, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }