{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 309, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 6.266341746346053, "learning_rate": 1.6129032258064518e-07, "logits/chosen": -0.5416143536567688, "logits/rejected": -0.9699263572692871, "logps/chosen": -998.3239135742188, "logps/rejected": -1286.9267578125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 6.79176132535156, "learning_rate": 1.6129032258064516e-06, "logits/chosen": -0.501338541507721, "logits/rejected": -0.6205970048904419, "logps/chosen": -965.1869506835938, "logps/rejected": -1388.869140625, "loss": 0.6908, "rewards/accuracies": 0.4791666567325592, "rewards/chosen": -0.005239578895270824, "rewards/margins": 0.0034307329915463924, "rewards/rejected": -0.00867031142115593, "step": 10 }, { "epoch": 0.06, "grad_norm": 5.46084581758702, "learning_rate": 3.225806451612903e-06, "logits/chosen": -0.4920225143432617, "logits/rejected": -0.5186491012573242, "logps/chosen": -1001.7984619140625, "logps/rejected": -1416.242431640625, "loss": 0.6367, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13937075436115265, "rewards/margins": 0.18681207299232483, "rewards/rejected": -0.32618284225463867, "step": 20 }, { "epoch": 0.1, "grad_norm": 5.423152706060893, "learning_rate": 4.838709677419355e-06, "logits/chosen": -0.3083574175834656, "logits/rejected": -0.28873246908187866, "logps/chosen": -936.2586669921875, "logps/rejected": -1426.4195556640625, "loss": 0.4832, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.22132034599781036, "rewards/margins": 0.7907422780990601, "rewards/rejected": -1.012062668800354, "step": 30 }, { "epoch": 0.13, "grad_norm": 4.497512811390226, "learning_rate": 4.987080943856887e-06, "logits/chosen": -0.39709392189979553, "logits/rejected": -0.3410794734954834, "logps/chosen": -864.56787109375, "logps/rejected": -1415.4027099609375, "loss": 0.431, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10076986253261566, "rewards/margins": 1.5246957540512085, "rewards/rejected": -1.625465750694275, "step": 40 }, { "epoch": 0.16, "grad_norm": 5.38009380317694, "learning_rate": 4.942593872763566e-06, "logits/chosen": -0.2893625795841217, "logits/rejected": -0.3507555425167084, "logps/chosen": -904.3942260742188, "logps/rejected": -1616.10986328125, "loss": 0.3741, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.2879861891269684, "rewards/margins": 2.9569807052612305, "rewards/rejected": -3.244966983795166, "step": 50 }, { "epoch": 0.19, "grad_norm": 4.021367545314158, "learning_rate": 4.866946677079314e-06, "logits/chosen": -0.2734339237213135, "logits/rejected": -0.2639048397541046, "logps/chosen": -1028.2137451171875, "logps/rejected": -1476.271728515625, "loss": 0.349, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.4066247045993805, "rewards/margins": 1.8377554416656494, "rewards/rejected": -2.244380235671997, "step": 60 }, { "epoch": 0.23, "grad_norm": 5.598262404371225, "learning_rate": 4.761104386672074e-06, "logits/chosen": -0.1697017401456833, "logits/rejected": -0.25459176301956177, "logps/chosen": -949.3585205078125, "logps/rejected": -1626.849365234375, "loss": 0.3077, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.5060699582099915, "rewards/margins": 2.161994457244873, "rewards/rejected": -2.6680643558502197, "step": 70 }, { "epoch": 0.26, "grad_norm": 4.260536872127374, "learning_rate": 4.626417229671401e-06, "logits/chosen": -0.22611574828624725, "logits/rejected": -0.2613917291164398, "logps/chosen": -920.0872192382812, "logps/rejected": -1731.564453125, "loss": 0.2782, "rewards/accuracies": 0.90625, "rewards/chosen": -0.45848578214645386, "rewards/margins": 3.2202796936035156, "rewards/rejected": -3.6787655353546143, "step": 80 }, { "epoch": 0.29, "grad_norm": 5.248398944622536, "learning_rate": 4.464603407633326e-06, "logits/chosen": -0.20690715312957764, "logits/rejected": -0.3079308271408081, "logps/chosen": -1012.1229248046875, "logps/rejected": -1682.8330078125, "loss": 0.2599, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.40270423889160156, "rewards/margins": 3.2318668365478516, "rewards/rejected": -3.634571075439453, "step": 90 }, { "epoch": 0.32, "grad_norm": 4.915340212981017, "learning_rate": 4.2777271764750805e-06, "logits/chosen": -0.27068111300468445, "logits/rejected": -0.22187161445617676, "logps/chosen": -952.927734375, "logps/rejected": -1708.844482421875, "loss": 0.2249, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.5603285431861877, "rewards/margins": 3.670332670211792, "rewards/rejected": -4.230660915374756, "step": 100 }, { "epoch": 0.32, "eval_logits/chosen": -0.1106632873415947, "eval_logits/rejected": -0.028987836092710495, "eval_logps/chosen": -650.2339477539062, "eval_logps/rejected": -1190.2701416015625, "eval_loss": 0.3820802569389343, "eval_rewards/accuracies": 0.8551136255264282, "eval_rewards/chosen": -0.9338886141777039, "eval_rewards/margins": 3.643230438232422, "eval_rewards/rejected": -4.577118873596191, "eval_runtime": 178.7591, "eval_samples_per_second": 7.787, "eval_steps_per_second": 0.246, "step": 100 }, { "epoch": 0.36, "grad_norm": 3.910365611216077, "learning_rate": 4.06817251280076e-06, "logits/chosen": -0.20240063965320587, "logits/rejected": -0.265985906124115, "logps/chosen": -1035.5126953125, "logps/rejected": -1685.556640625, "loss": 0.2552, "rewards/accuracies": 0.90625, "rewards/chosen": -0.44126248359680176, "rewards/margins": 3.790078639984131, "rewards/rejected": -4.231341361999512, "step": 110 }, { "epoch": 0.39, "grad_norm": 9.625288099258173, "learning_rate": 3.838612701556138e-06, "logits/chosen": -0.243825763463974, "logits/rejected": -0.3290537893772125, "logps/chosen": -967.8455810546875, "logps/rejected": -1898.212646484375, "loss": 0.2415, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.19587793946266174, "rewards/margins": 3.995615005493164, "rewards/rejected": -4.191493034362793, "step": 120 }, { "epoch": 0.42, "grad_norm": 3.4423605201536476, "learning_rate": 3.5919762329823556e-06, "logits/chosen": -0.22354824841022491, "logits/rejected": -0.2977936565876007, "logps/chosen": -962.7554931640625, "logps/rejected": -1781.5015869140625, "loss": 0.2134, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3282354772090912, "rewards/margins": 4.096698760986328, "rewards/rejected": -4.424933910369873, "step": 130 }, { "epoch": 0.45, "grad_norm": 4.355526701031263, "learning_rate": 3.3314094439203903e-06, "logits/chosen": -0.21654577553272247, "logits/rejected": -0.3782255947589874, "logps/chosen": -956.0657348632812, "logps/rejected": -1783.807861328125, "loss": 0.2054, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.5269081592559814, "rewards/margins": 4.415614128112793, "rewards/rejected": -4.9425225257873535, "step": 140 }, { "epoch": 0.49, "grad_norm": 4.577252564770749, "learning_rate": 3.0602363800505198e-06, "logits/chosen": -0.21595144271850586, "logits/rejected": -0.3744010925292969, "logps/chosen": -1055.1593017578125, "logps/rejected": -1893.0072021484375, "loss": 0.1989, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3978404402732849, "rewards/margins": 5.1758904457092285, "rewards/rejected": -5.573730945587158, "step": 150 }, { "epoch": 0.52, "grad_norm": 4.231494324857874, "learning_rate": 2.7819163911034175e-06, "logits/chosen": -0.22502343356609344, "logits/rejected": -0.37728679180145264, "logps/chosen": -989.0603637695312, "logps/rejected": -1922.759521484375, "loss": 0.1431, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.17479470372200012, "rewards/margins": 5.446383476257324, "rewards/rejected": -5.62117862701416, "step": 160 }, { "epoch": 0.55, "grad_norm": 6.2160398163872665, "learning_rate": 2.5e-06, "logits/chosen": -0.2915242314338684, "logits/rejected": -0.45713406801223755, "logps/chosen": -1042.622314453125, "logps/rejected": -2141.868896484375, "loss": 0.1686, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.609991192817688, "rewards/margins": 7.452821254730225, "rewards/rejected": -8.062813758850098, "step": 170 }, { "epoch": 0.58, "grad_norm": 2.781036420408215, "learning_rate": 2.2180836088965833e-06, "logits/chosen": -0.3469601273536682, "logits/rejected": -0.5014016032218933, "logps/chosen": -1045.5155029296875, "logps/rejected": -1821.9827880859375, "loss": 0.1551, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07482553273439407, "rewards/margins": 5.159177303314209, "rewards/rejected": -5.234003067016602, "step": 180 }, { "epoch": 0.61, "grad_norm": 3.3125651058366268, "learning_rate": 1.939763619949481e-06, "logits/chosen": -0.35754817724227905, "logits/rejected": -0.5549635887145996, "logps/chosen": -864.7000732421875, "logps/rejected": -1795.1302490234375, "loss": 0.1376, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1424350142478943, "rewards/margins": 4.835862636566162, "rewards/rejected": -4.978297710418701, "step": 190 }, { "epoch": 0.65, "grad_norm": 4.010712916190072, "learning_rate": 1.6685905560796101e-06, "logits/chosen": -0.46689772605895996, "logits/rejected": -0.6180375814437866, "logps/chosen": -1065.89599609375, "logps/rejected": -2047.24609375, "loss": 0.1549, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5212541222572327, "rewards/margins": 6.1373467445373535, "rewards/rejected": -6.658600807189941, "step": 200 }, { "epoch": 0.65, "eval_logits/chosen": -0.3179807960987091, "eval_logits/rejected": -0.32223665714263916, "eval_logps/chosen": -652.9113159179688, "eval_logps/rejected": -1308.40478515625, "eval_loss": 0.27093741297721863, "eval_rewards/accuracies": 0.8977272510528564, "eval_rewards/chosen": -0.960662305355072, "eval_rewards/margins": 4.797802925109863, "eval_rewards/rejected": -5.758464813232422, "eval_runtime": 173.283, "eval_samples_per_second": 8.033, "eval_steps_per_second": 0.254, "step": 200 }, { "epoch": 0.68, "grad_norm": 3.8323612831804175, "learning_rate": 1.4080237670176456e-06, "logits/chosen": -0.4167974889278412, "logits/rejected": -0.5508753061294556, "logps/chosen": -1013.05419921875, "logps/rejected": -1894.520263671875, "loss": 0.1443, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.12083841860294342, "rewards/margins": 5.549345970153809, "rewards/rejected": -5.67018461227417, "step": 210 }, { "epoch": 0.71, "grad_norm": 4.939784171225143, "learning_rate": 1.161387298443863e-06, "logits/chosen": -0.3491138815879822, "logits/rejected": -0.5552398562431335, "logps/chosen": -902.9434814453125, "logps/rejected": -1838.8209228515625, "loss": 0.148, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.21528509259223938, "rewards/margins": 5.609208583831787, "rewards/rejected": -5.824493408203125, "step": 220 }, { "epoch": 0.74, "grad_norm": 5.989265642846245, "learning_rate": 9.318274871992408e-07, "logits/chosen": -0.44859474897384644, "logits/rejected": -0.6015830636024475, "logps/chosen": -1067.4637451171875, "logps/rejected": -2109.045654296875, "loss": 0.157, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.305181622505188, "rewards/margins": 7.274392604827881, "rewards/rejected": -7.579575538635254, "step": 230 }, { "epoch": 0.78, "grad_norm": 4.078696847368483, "learning_rate": 7.222728235249196e-07, "logits/chosen": -0.4398832321166992, "logits/rejected": -0.5384049415588379, "logps/chosen": -971.4193115234375, "logps/rejected": -1864.9664306640625, "loss": 0.1321, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.1642368584871292, "rewards/margins": 5.839243412017822, "rewards/rejected": -6.003479957580566, "step": 240 }, { "epoch": 0.81, "grad_norm": 2.772016531721353, "learning_rate": 5.353965923666743e-07, "logits/chosen": -0.4434467852115631, "logits/rejected": -0.5806728601455688, "logps/chosen": -985.8177490234375, "logps/rejected": -1902.5126953125, "loss": 0.125, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.21859340369701385, "rewards/margins": 5.029125213623047, "rewards/rejected": -5.247718334197998, "step": 250 }, { "epoch": 0.84, "grad_norm": 3.5375826568350557, "learning_rate": 3.7358277032860016e-07, "logits/chosen": -0.47191086411476135, "logits/rejected": -0.5328727960586548, "logps/chosen": -913.7701416015625, "logps/rejected": -2043.599365234375, "loss": 0.1244, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4412022531032562, "rewards/margins": 6.3683366775512695, "rewards/rejected": -6.809537410736084, "step": 260 }, { "epoch": 0.87, "grad_norm": 3.1823409653630907, "learning_rate": 2.388956133279266e-07, "logits/chosen": -0.3901548683643341, "logits/rejected": -0.4868335723876953, "logps/chosen": -891.0558471679688, "logps/rejected": -1852.1605224609375, "loss": 0.1274, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5433205366134644, "rewards/margins": 5.383849620819092, "rewards/rejected": -5.927170753479004, "step": 270 }, { "epoch": 0.91, "grad_norm": 2.5807533458880316, "learning_rate": 1.3305332292068706e-07, "logits/chosen": -0.3871026039123535, "logits/rejected": -0.5441304445266724, "logps/chosen": -887.1787109375, "logps/rejected": -1961.8681640625, "loss": 0.0871, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.31611892580986023, "rewards/margins": 5.871817588806152, "rewards/rejected": -6.187936782836914, "step": 280 }, { "epoch": 0.94, "grad_norm": 2.1667077758960303, "learning_rate": 5.7406127236434016e-08, "logits/chosen": -0.45710650086402893, "logits/rejected": -0.6058652400970459, "logps/chosen": -984.42578125, "logps/rejected": -2196.794189453125, "loss": 0.0998, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6274508237838745, "rewards/margins": 7.608497619628906, "rewards/rejected": -8.23594856262207, "step": 290 }, { "epoch": 0.97, "grad_norm": 2.549184554041034, "learning_rate": 1.2919056143113062e-08, "logits/chosen": -0.4359508156776428, "logits/rejected": -0.6320601105690002, "logps/chosen": -949.6388549804688, "logps/rejected": -2248.93798828125, "loss": 0.0946, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5125720500946045, "rewards/margins": 7.058285713195801, "rewards/rejected": -7.570857048034668, "step": 300 }, { "epoch": 0.97, "eval_logits/chosen": -0.3523492217063904, "eval_logits/rejected": -0.3282558023929596, "eval_logps/chosen": -679.615478515625, "eval_logps/rejected": -1442.4718017578125, "eval_loss": 0.27558115124702454, "eval_rewards/accuracies": 0.8920454382896423, "eval_rewards/chosen": -1.2277040481567383, "eval_rewards/margins": 5.87143087387085, "eval_rewards/rejected": -7.099134922027588, "eval_runtime": 171.6402, "eval_samples_per_second": 8.11, "eval_steps_per_second": 0.256, "step": 300 }, { "epoch": 1.0, "step": 309, "total_flos": 0.0, "train_loss": 0.0030385508506429234, "train_runtime": 144.6456, "train_samples_per_second": 136.617, "train_steps_per_second": 2.136 } ], "logging_steps": 10, "max_steps": 309, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }