{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998766954377312, "eval_steps": 100, "global_step": 405, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.2195121951219512e-08, "logits/chosen": -2.8681135177612305, "logits/rejected": -2.8858838081359863, "logps/chosen": -518.1907958984375, "logps/rejected": -109.31971740722656, "loss": 0.3475, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.219512195121951e-07, "logits/chosen": -2.7986178398132324, "logits/rejected": -2.752176284790039, "logps/chosen": -434.208251953125, "logps/rejected": -114.19618225097656, "loss": 0.3394, "rewards/accuracies": 0.4930555522441864, "rewards/chosen": 0.0002649651141837239, "rewards/margins": 0.0009347840095870197, "rewards/rejected": -0.0006698188371956348, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.439024390243902e-07, "logits/chosen": -2.8215415477752686, "logits/rejected": -2.7983882427215576, "logps/chosen": -417.2633361816406, "logps/rejected": -118.0062026977539, "loss": 0.3373, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.019945567473769188, "rewards/margins": 0.03575458750128746, "rewards/rejected": -0.015809018164873123, "step": 20 }, { "epoch": 0.07, "learning_rate": 3.6585365853658536e-07, "logits/chosen": -2.6574862003326416, "logits/rejected": -2.6451315879821777, "logps/chosen": -398.87353515625, "logps/rejected": -125.69970703125, "loss": 0.3045, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.07569055259227753, "rewards/margins": 0.19884702563285828, "rewards/rejected": -0.12315647304058075, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.878048780487804e-07, "logits/chosen": -2.54256272315979, "logits/rejected": -2.5281729698181152, "logps/chosen": -384.5321044921875, "logps/rejected": -168.55758666992188, "loss": 0.2564, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.024631643667817116, "rewards/margins": 0.41851943731307983, "rewards/rejected": -0.39388787746429443, "step": 40 }, { "epoch": 0.12, "learning_rate": 4.992461696250783e-07, "logits/chosen": -2.4257261753082275, "logits/rejected": -2.3928446769714355, "logps/chosen": -436.45330810546875, "logps/rejected": -219.0617218017578, "loss": 0.1809, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1671580970287323, "rewards/margins": 0.7879143953323364, "rewards/rejected": -0.9550724029541016, "step": 50 }, { "epoch": 0.15, "learning_rate": 4.966461721767899e-07, "logits/chosen": -2.3805835247039795, "logits/rejected": -2.3364853858947754, "logps/chosen": -437.4466857910156, "logps/rejected": -240.6685791015625, "loss": 0.1377, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.32454290986061096, "rewards/margins": 0.9316140413284302, "rewards/rejected": -1.2561569213867188, "step": 60 }, { "epoch": 0.17, "learning_rate": 4.922100518015975e-07, "logits/chosen": -2.3752458095550537, "logits/rejected": -2.3281030654907227, "logps/chosen": -419.6747131347656, "logps/rejected": -264.75787353515625, "loss": 0.103, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.3937300443649292, "rewards/margins": 1.1917842626571655, "rewards/rejected": -1.5855143070220947, "step": 70 }, { "epoch": 0.2, "learning_rate": 4.859708325770919e-07, "logits/chosen": -2.4320530891418457, "logits/rejected": -2.3738579750061035, "logps/chosen": -472.10479736328125, "logps/rejected": -330.32403564453125, "loss": 0.0674, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.5765678286552429, "rewards/margins": 1.5421369075775146, "rewards/rejected": -2.1187047958374023, "step": 80 }, { "epoch": 0.22, "learning_rate": 4.779749614980225e-07, "logits/chosen": -2.3991949558258057, "logits/rejected": -2.357053279876709, "logps/chosen": -487.83074951171875, "logps/rejected": -349.1925354003906, "loss": 0.0553, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.666537880897522, "rewards/margins": 1.7182201147079468, "rewards/rejected": -2.3847577571868896, "step": 90 }, { "epoch": 0.25, "learning_rate": 4.682819627081427e-07, "logits/chosen": -2.3752927780151367, "logits/rejected": -2.326216220855713, "logps/chosen": -515.1549682617188, "logps/rejected": -378.8877868652344, "loss": 0.0437, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.6667075157165527, "rewards/margins": 2.000246524810791, "rewards/rejected": -2.666954278945923, "step": 100 }, { "epoch": 0.25, "eval_logits/chosen": -2.3259778022766113, "eval_logits/rejected": -2.314302682876587, "eval_logps/chosen": -482.4153747558594, "eval_logps/rejected": -504.759033203125, "eval_loss": 0.08243285864591599, "eval_rewards/accuracies": 0.5859375, "eval_rewards/chosen": -2.2537574768066406, "eval_rewards/margins": 0.22029951214790344, "eval_rewards/rejected": -2.4740567207336426, "eval_runtime": 53.3582, "eval_samples_per_second": 37.483, "eval_steps_per_second": 0.6, "step": 100 }, { "epoch": 0.27, "learning_rate": 4.569639943810477e-07, "logits/chosen": -2.3144371509552, "logits/rejected": -2.2340025901794434, "logps/chosen": -490.12921142578125, "logps/rejected": -419.07867431640625, "loss": 0.0359, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3260681629180908, "rewards/margins": 1.7610738277435303, "rewards/rejected": -3.0871422290802, "step": 110 }, { "epoch": 0.3, "learning_rate": 4.4410531154874543e-07, "logits/chosen": -2.3716444969177246, "logits/rejected": -2.3235533237457275, "logps/chosen": -466.01702880859375, "logps/rejected": -356.8735046386719, "loss": 0.0575, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7180399298667908, "rewards/margins": 1.6505486965179443, "rewards/rejected": -2.368588924407959, "step": 120 }, { "epoch": 0.32, "learning_rate": 4.298016388768561e-07, "logits/chosen": -2.3074584007263184, "logits/rejected": -2.257930040359497, "logps/chosen": -472.1845703125, "logps/rejected": -373.66522216796875, "loss": 0.0498, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0088322162628174, "rewards/margins": 1.6445964574813843, "rewards/rejected": -2.653428792953491, "step": 130 }, { "epoch": 0.35, "learning_rate": 4.1415945805573005e-07, "logits/chosen": -2.225804328918457, "logits/rejected": -2.15400767326355, "logps/chosen": -534.1700439453125, "logps/rejected": -430.3104553222656, "loss": 0.0361, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.042690634727478, "rewards/margins": 2.092653751373291, "rewards/rejected": -3.1353445053100586, "step": 140 }, { "epoch": 0.37, "learning_rate": 3.972952151123984e-07, "logits/chosen": -2.2562787532806396, "logits/rejected": -2.164506673812866, "logps/chosen": -522.7659912109375, "logps/rejected": -425.18109130859375, "loss": 0.0344, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1226718425750732, "rewards/margins": 2.0036892890930176, "rewards/rejected": -3.12636137008667, "step": 150 }, { "epoch": 0.39, "learning_rate": 3.793344535444142e-07, "logits/chosen": -2.267565965652466, "logits/rejected": -2.1969974040985107, "logps/chosen": -530.3189086914062, "logps/rejected": -426.72332763671875, "loss": 0.0393, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2129985094070435, "rewards/margins": 1.8231168985366821, "rewards/rejected": -3.0361156463623047, "step": 160 }, { "epoch": 0.42, "learning_rate": 3.604108797288461e-07, "logits/chosen": -2.237342119216919, "logits/rejected": -2.1961898803710938, "logps/chosen": -448.13812255859375, "logps/rejected": -372.9068298339844, "loss": 0.0465, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.092370629310608, "rewards/margins": 1.5985119342803955, "rewards/rejected": -2.690882444381714, "step": 170 }, { "epoch": 0.44, "learning_rate": 3.40665367563858e-07, "logits/chosen": -2.2571911811828613, "logits/rejected": -2.140353202819824, "logps/chosen": -548.1529541015625, "logps/rejected": -449.4532165527344, "loss": 0.035, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8807584643363953, "rewards/margins": 2.355053663253784, "rewards/rejected": -3.2358124256134033, "step": 180 }, { "epoch": 0.47, "learning_rate": 3.202449097526798e-07, "logits/chosen": -2.1954236030578613, "logits/rejected": -2.113832950592041, "logps/chosen": -545.7277221679688, "logps/rejected": -466.76580810546875, "loss": 0.029, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.4056795835494995, "rewards/margins": 2.1022555828094482, "rewards/rejected": -3.5079357624053955, "step": 190 }, { "epoch": 0.49, "learning_rate": 2.993015235369905e-07, "logits/chosen": -2.1386027336120605, "logits/rejected": -2.0572166442871094, "logps/chosen": -560.2534790039062, "logps/rejected": -491.8816833496094, "loss": 0.0258, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.810624361038208, "rewards/margins": 1.9691530466079712, "rewards/rejected": -3.7797775268554688, "step": 200 }, { "epoch": 0.49, "eval_logits/chosen": -2.1394448280334473, "eval_logits/rejected": -2.1155476570129395, "eval_logps/chosen": -543.8071899414062, "eval_logps/rejected": -579.2755126953125, "eval_loss": 0.0581156425178051, "eval_rewards/accuracies": 0.59765625, "eval_rewards/chosen": -2.86767578125, "eval_rewards/margins": 0.35154610872268677, "eval_rewards/rejected": -3.219222068786621, "eval_runtime": 53.2701, "eval_samples_per_second": 37.545, "eval_steps_per_second": 0.601, "step": 200 }, { "epoch": 0.52, "learning_rate": 2.7799111902582693e-07, "logits/chosen": -2.1782305240631104, "logits/rejected": -2.044674873352051, "logps/chosen": -579.908935546875, "logps/rejected": -500.6641540527344, "loss": 0.0219, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.408044695854187, "rewards/margins": 2.4992563724517822, "rewards/rejected": -3.9073009490966797, "step": 210 }, { "epoch": 0.54, "learning_rate": 2.564723385445869e-07, "logits/chosen": -2.2589755058288574, "logits/rejected": -2.156228542327881, "logps/chosen": -563.1976318359375, "logps/rejected": -475.75030517578125, "loss": 0.038, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3078866004943848, "rewards/margins": 2.1681323051452637, "rewards/rejected": -3.4760184288024902, "step": 220 }, { "epoch": 0.57, "learning_rate": 2.3490537564442845e-07, "logits/chosen": -2.2288191318511963, "logits/rejected": -2.136579751968384, "logps/chosen": -507.54632568359375, "logps/rejected": -419.88470458984375, "loss": 0.0432, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2000774145126343, "rewards/margins": 1.7510545253753662, "rewards/rejected": -2.951131820678711, "step": 230 }, { "epoch": 0.59, "learning_rate": 2.1345078256378801e-07, "logits/chosen": -2.282217264175415, "logits/rejected": -2.1927459239959717, "logps/chosen": -539.92822265625, "logps/rejected": -433.8241271972656, "loss": 0.0373, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2784963846206665, "rewards/margins": 1.8950881958007812, "rewards/rejected": -3.1735846996307373, "step": 240 }, { "epoch": 0.62, "learning_rate": 1.9226827501969865e-07, "logits/chosen": -2.2803092002868652, "logits/rejected": -2.1990160942077637, "logps/chosen": -537.9136962890625, "logps/rejected": -442.28350830078125, "loss": 0.04, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2320274114608765, "rewards/margins": 2.1027939319610596, "rewards/rejected": -3.3348212242126465, "step": 250 }, { "epoch": 0.64, "learning_rate": 1.715155432264775e-07, "logits/chosen": -2.2646355628967285, "logits/rejected": -2.14613676071167, "logps/chosen": -502.49664306640625, "logps/rejected": -420.11004638671875, "loss": 0.0396, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1264328956604004, "rewards/margins": 2.006878614425659, "rewards/rejected": -3.1333117485046387, "step": 260 }, { "epoch": 0.67, "learning_rate": 1.51347077992983e-07, "logits/chosen": -2.3088698387145996, "logits/rejected": -2.2018628120422363, "logps/chosen": -554.0256958007812, "logps/rejected": -421.2101135253906, "loss": 0.0375, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.10258948802948, "rewards/margins": 1.9626919031143188, "rewards/rejected": -3.065281391143799, "step": 270 }, { "epoch": 0.69, "learning_rate": 1.3191302063739906e-07, "logits/chosen": -2.310133457183838, "logits/rejected": -2.216827392578125, "logps/chosen": -522.3606567382812, "logps/rejected": -438.058349609375, "loss": 0.043, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1809624433517456, "rewards/margins": 1.9291051626205444, "rewards/rejected": -3.110067844390869, "step": 280 }, { "epoch": 0.72, "learning_rate": 1.1335804528119475e-07, "logits/chosen": -2.3108785152435303, "logits/rejected": -2.2141172885894775, "logps/chosen": -544.7510986328125, "logps/rejected": -427.60150146484375, "loss": 0.044, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.00501549243927, "rewards/margins": 2.146829605102539, "rewards/rejected": -3.1518452167510986, "step": 290 }, { "epoch": 0.74, "learning_rate": 9.582028184286423e-08, "logits/chosen": -2.350487470626831, "logits/rejected": -2.307096481323242, "logps/chosen": -554.42529296875, "logps/rejected": -470.14434814453125, "loss": 0.0402, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.1662867069244385, "rewards/margins": 2.156501531600952, "rewards/rejected": -3.3227882385253906, "step": 300 }, { "epoch": 0.74, "eval_logits/chosen": -2.2979543209075928, "eval_logits/rejected": -2.2751243114471436, "eval_logps/chosen": -467.0057067871094, "eval_logps/rejected": -507.4114685058594, "eval_loss": 0.08367828279733658, "eval_rewards/accuracies": 0.62890625, "eval_rewards/chosen": -2.099660634994507, "eval_rewards/margins": 0.4009218215942383, "eval_rewards/rejected": -2.500582218170166, "eval_runtime": 53.3734, "eval_samples_per_second": 37.472, "eval_steps_per_second": 0.6, "step": 300 }, { "epoch": 0.76, "learning_rate": 7.943028774907065e-08, "logits/chosen": -2.316253185272217, "logits/rejected": -2.209606170654297, "logps/chosen": -524.6145629882812, "logps/rejected": -420.94671630859375, "loss": 0.0324, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0568145513534546, "rewards/margins": 2.0644707679748535, "rewards/rejected": -3.1212852001190186, "step": 310 }, { "epoch": 0.79, "learning_rate": 6.431007601814637e-08, "logits/chosen": -2.2733869552612305, "logits/rejected": -2.169506549835205, "logps/chosen": -532.5906982421875, "logps/rejected": -450.932373046875, "loss": 0.0316, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5221502780914307, "rewards/margins": 1.9199508428573608, "rewards/rejected": -3.442101001739502, "step": 320 }, { "epoch": 0.81, "learning_rate": 5.0572206951246e-08, "logits/chosen": -2.290539503097534, "logits/rejected": -2.193920850753784, "logps/chosen": -562.043701171875, "logps/rejected": -479.5208435058594, "loss": 0.0278, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3767458200454712, "rewards/margins": 2.1372973918914795, "rewards/rejected": -3.514043092727661, "step": 330 }, { "epoch": 0.84, "learning_rate": 3.831895019292897e-08, "logits/chosen": -2.3263535499572754, "logits/rejected": -2.207899570465088, "logps/chosen": -619.2625122070312, "logps/rejected": -520.6148071289062, "loss": 0.0305, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5666420459747314, "rewards/margins": 2.300938844680786, "rewards/rejected": -3.8675804138183594, "step": 340 }, { "epoch": 0.86, "learning_rate": 2.764152339909756e-08, "logits/chosen": -2.305875539779663, "logits/rejected": -2.1924188137054443, "logps/chosen": -568.1319580078125, "logps/rejected": -475.6539611816406, "loss": 0.0245, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.341552972793579, "rewards/margins": 2.3198726177215576, "rewards/rejected": -3.661425828933716, "step": 350 }, { "epoch": 0.89, "learning_rate": 1.861941317991664e-08, "logits/chosen": -2.31453800201416, "logits/rejected": -2.209552049636841, "logps/chosen": -574.0198974609375, "logps/rejected": -498.809326171875, "loss": 0.0246, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.3858083486557007, "rewards/margins": 2.422987699508667, "rewards/rejected": -3.80879545211792, "step": 360 }, { "epoch": 0.91, "learning_rate": 1.13197833728636e-08, "logits/chosen": -2.2876980304718018, "logits/rejected": -2.1881823539733887, "logps/chosen": -583.4609985351562, "logps/rejected": -515.4216918945312, "loss": 0.0274, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2713569402694702, "rewards/margins": 2.548645496368408, "rewards/rejected": -3.820002317428589, "step": 370 }, { "epoch": 0.94, "learning_rate": 5.79697505093521e-09, "logits/chosen": -2.2938995361328125, "logits/rejected": -2.161371946334839, "logps/chosen": -567.2229614257812, "logps/rejected": -493.6429138183594, "loss": 0.0339, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.495012879371643, "rewards/margins": 2.173337697982788, "rewards/rejected": -3.6683506965637207, "step": 380 }, { "epoch": 0.96, "learning_rate": 2.092101988131256e-09, "logits/chosen": -2.3137059211730957, "logits/rejected": -2.1986048221588135, "logps/chosen": -600.1227416992188, "logps/rejected": -496.6559143066406, "loss": 0.0258, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.368238925933838, "rewards/margins": 2.388805389404297, "rewards/rejected": -3.7570443153381348, "step": 390 }, { "epoch": 0.99, "learning_rate": 2.327445937151673e-10, "logits/chosen": -2.316849708557129, "logits/rejected": -2.1959729194641113, "logps/chosen": -559.0263061523438, "logps/rejected": -482.11773681640625, "loss": 0.0288, "rewards/accuracies": 0.78125, "rewards/chosen": -1.659148931503296, "rewards/margins": 2.0086750984191895, "rewards/rejected": -3.6678237915039062, "step": 400 }, { "epoch": 0.99, "eval_logits/chosen": -2.2708253860473633, "eval_logits/rejected": -2.245922565460205, "eval_logps/chosen": -513.0960083007812, "eval_logps/rejected": -552.8470458984375, "eval_loss": 0.06599809229373932, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": -2.560563564300537, "eval_rewards/margins": 0.3943747282028198, "eval_rewards/rejected": -2.9549384117126465, "eval_runtime": 53.3482, "eval_samples_per_second": 37.49, "eval_steps_per_second": 0.6, "step": 400 }, { "epoch": 1.0, "step": 405, "total_flos": 0.0, "train_loss": 0.07151281171374851, "train_runtime": 3738.25, "train_samples_per_second": 13.882, "train_steps_per_second": 0.108 } ], "logging_steps": 10, "max_steps": 405, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }