{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 252, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 16.79678467461835, "learning_rate": 1.923076923076923e-08, "logits/chosen": -0.5216625928878784, "logits/rejected": -1.6251907348632812, "logps/chosen": -339.42877197265625, "logps/rejected": -263.98431396484375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.04, "grad_norm": 17.217350690399922, "learning_rate": 1.9230769230769231e-07, "logits/chosen": -1.7395856380462646, "logits/rejected": -1.1197137832641602, "logps/chosen": -194.7418212890625, "logps/rejected": -322.9896240234375, "loss": 0.693, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": 0.0007130156154744327, "rewards/margins": 0.0009009492350742221, "rewards/rejected": -0.00018793345952872187, "step": 10 }, { "epoch": 0.08, "grad_norm": 17.755504451917034, "learning_rate": 3.8461538461538463e-07, "logits/chosen": -1.3529917001724243, "logits/rejected": -1.1581436395645142, "logps/chosen": -250.0107879638672, "logps/rejected": -309.7054138183594, "loss": 0.69, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.003061536233872175, "rewards/margins": 0.0077318595722317696, "rewards/rejected": -0.004670322872698307, "step": 20 }, { "epoch": 0.12, "grad_norm": 15.809698317809115, "learning_rate": 4.99613632163459e-07, "logits/chosen": -1.445340633392334, "logits/rejected": -0.8237818479537964, "logps/chosen": -262.8525695800781, "logps/rejected": -319.0481872558594, "loss": 0.6756, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.011689678765833378, "rewards/margins": 0.03540501371026039, "rewards/rejected": -0.02371532842516899, "step": 30 }, { "epoch": 0.16, "grad_norm": 17.829217597278124, "learning_rate": 4.952806974561517e-07, "logits/chosen": -1.3351449966430664, "logits/rejected": -0.8461858034133911, "logps/chosen": -231.9071807861328, "logps/rejected": -339.33740234375, "loss": 0.6256, "rewards/accuracies": 0.84375, "rewards/chosen": 0.006707000080496073, "rewards/margins": 0.18326039612293243, "rewards/rejected": -0.1765533834695816, "step": 40 }, { "epoch": 0.2, "grad_norm": 20.372527502778215, "learning_rate": 4.862157403595598e-07, "logits/chosen": -1.0431455373764038, "logits/rejected": -0.7218812704086304, "logps/chosen": -242.2100372314453, "logps/rejected": -352.3864440917969, "loss": 0.5443, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.04869229719042778, "rewards/margins": 0.5344938039779663, "rewards/rejected": -0.5831860303878784, "step": 50 }, { "epoch": 0.24, "grad_norm": 24.684374727803654, "learning_rate": 4.725936445085709e-07, "logits/chosen": -0.7200717329978943, "logits/rejected": -0.5803043842315674, "logps/chosen": -277.9634094238281, "logps/rejected": -517.9424438476562, "loss": 0.4741, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.17333514988422394, "rewards/margins": 1.385983943939209, "rewards/rejected": -1.559319019317627, "step": 60 }, { "epoch": 0.28, "grad_norm": 22.14152988454074, "learning_rate": 4.5467721110696685e-07, "logits/chosen": -0.515616774559021, "logits/rejected": -0.04089225083589554, "logps/chosen": -258.40606689453125, "logps/rejected": -408.2468566894531, "loss": 0.4328, "rewards/accuracies": 0.84375, "rewards/chosen": -0.24228188395500183, "rewards/margins": 
1.2516155242919922, "rewards/rejected": -1.493897557258606, "step": 70 }, { "epoch": 0.32, "grad_norm": 18.065752819849052, "learning_rate": 4.328120888946271e-07, "logits/chosen": -0.7557204961776733, "logits/rejected": -0.010405841283500195, "logps/chosen": -258.38092041015625, "logps/rejected": -523.7661743164062, "loss": 0.4252, "rewards/accuracies": 0.84375, "rewards/chosen": -0.33308929204940796, "rewards/margins": 2.278934955596924, "rewards/rejected": -2.6120240688323975, "step": 80 }, { "epoch": 0.36, "grad_norm": 15.734543008245188, "learning_rate": 4.074201057973785e-07, "logits/chosen": -1.038962721824646, "logits/rejected": 0.15530693531036377, "logps/chosen": -266.2573547363281, "logps/rejected": -538.8092041015625, "loss": 0.3708, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.19907574355602264, "rewards/margins": 2.0345568656921387, "rewards/rejected": -2.2336325645446777, "step": 90 }, { "epoch": 0.4, "grad_norm": 24.698688306800403, "learning_rate": 3.789911309071252e-07, "logits/chosen": -0.2497592717409134, "logits/rejected": 0.24183444678783417, "logps/chosen": -277.8612365722656, "logps/rejected": -582.8795776367188, "loss": 0.34, "rewards/accuracies": 0.90625, "rewards/chosen": -0.30463331937789917, "rewards/margins": 2.518612861633301, "rewards/rejected": -2.823246479034424, "step": 100 }, { "epoch": 0.4, "eval_logits/chosen": -1.5214157104492188, "eval_logits/rejected": 0.016294823959469795, "eval_logps/chosen": -229.1130828857422, "eval_logps/rejected": -291.0631408691406, "eval_loss": 0.5715546011924744, "eval_rewards/accuracies": 0.7202380895614624, "eval_rewards/chosen": -0.2893332839012146, "eval_rewards/margins": 0.37313312292099, "eval_rewards/rejected": -0.6624664068222046, "eval_runtime": 410.8502, "eval_samples_per_second": 9.789, "eval_steps_per_second": 0.307, "step": 100 }, { "epoch": 0.44, "grad_norm": 20.355882133570365, "learning_rate": 3.4807362379317026e-07, "logits/chosen": -0.35275131464004517, "logits/rejected": 0.4792235493659973, "logps/chosen": -295.2680358886719, "logps/rejected": -693.8294677734375, "loss": 0.3238, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.3359094262123108, "rewards/margins": 3.903926372528076, "rewards/rejected": -4.239835739135742, "step": 110 }, { "epoch": 0.48, "grad_norm": 21.727867963945446, "learning_rate": 3.152640534699994e-07, "logits/chosen": -0.5010538697242737, "logits/rejected": 0.6018115878105164, "logps/chosen": -291.17840576171875, "logps/rejected": -668.4251098632812, "loss": 0.3106, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.3278064429759979, "rewards/margins": 3.2680602073669434, "rewards/rejected": -3.595867156982422, "step": 120 }, { "epoch": 0.52, "grad_norm": 37.26634759168863, "learning_rate": 2.811953911537022e-07, "logits/chosen": -0.57627934217453, "logits/rejected": 0.6331204175949097, "logps/chosen": -227.9058837890625, "logps/rejected": -711.4694213867188, "loss": 0.299, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.33088651299476624, "rewards/margins": 3.8740921020507812, "rewards/rejected": -4.204977989196777, "step": 130 }, { "epoch": 0.56, "grad_norm": 38.01185078661505, "learning_rate": 2.4652489880792125e-07, "logits/chosen": 0.14344072341918945, "logits/rejected": 0.4568979740142822, "logps/chosen": -297.4254455566406, "logps/rejected": -691.5115356445312, "loss": 0.2841, "rewards/accuracies": 0.9375, "rewards/chosen": -0.29703259468078613, "rewards/margins": 3.58467173576355, "rewards/rejected": 
-3.881704330444336, "step": 140 }, { "epoch": 0.6, "grad_norm": 23.175417086747803, "learning_rate": 2.1192144906604874e-07, "logits/chosen": -0.35931748151779175, "logits/rejected": 0.38914966583251953, "logps/chosen": -250.97146606445312, "logps/rejected": -848.0675659179688, "loss": 0.2553, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.24952073395252228, "rewards/margins": 5.0541534423828125, "rewards/rejected": -5.303674221038818, "step": 150 }, { "epoch": 0.63, "grad_norm": 20.4769087360566, "learning_rate": 1.780526211572016e-07, "logits/chosen": -0.17432162165641785, "logits/rejected": 0.5454779267311096, "logps/chosen": -261.4326477050781, "logps/rejected": -773.8187866210938, "loss": 0.2334, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.21171124279499054, "rewards/margins": 4.594513416290283, "rewards/rejected": -4.806224822998047, "step": 160 }, { "epoch": 0.67, "grad_norm": 37.16207244417959, "learning_rate": 1.4557182178490635e-07, "logits/chosen": -0.05513007566332817, "logits/rejected": 0.6765660047531128, "logps/chosen": -286.18804931640625, "logps/rejected": -864.0691528320312, "loss": 0.2465, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3841165602207184, "rewards/margins": 5.005141258239746, "rewards/rejected": -5.389257431030273, "step": 170 }, { "epoch": 0.71, "grad_norm": 18.25462627863088, "learning_rate": 1.1510567942602889e-07, "logits/chosen": -0.39599961042404175, "logits/rejected": 0.6781516075134277, "logps/chosen": -263.38983154296875, "logps/rejected": -744.1962280273438, "loss": 0.2155, "rewards/accuracies": 0.9375, "rewards/chosen": -0.31235271692276, "rewards/margins": 4.253737926483154, "rewards/rejected": -4.5660905838012695, "step": 180 }, { "epoch": 0.75, "grad_norm": 41.13316289752075, "learning_rate": 8.724195524258688e-08, "logits/chosen": -0.2813408672809601, "logits/rejected": 0.8199517130851746, "logps/chosen": -277.7645263671875, "logps/rejected": -847.7828369140625, "loss": 0.2412, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.3673214316368103, "rewards/margins": 5.06696081161499, "rewards/rejected": -5.434282302856445, "step": 190 }, { "epoch": 0.79, "grad_norm": 29.643198990740647, "learning_rate": 6.251820383244468e-08, "logits/chosen": -0.14953655004501343, "logits/rejected": 0.8468856811523438, "logps/chosen": -270.6844177246094, "logps/rejected": -735.6868286132812, "loss": 0.2189, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2686000466346741, "rewards/margins": 4.150389194488525, "rewards/rejected": -4.418989658355713, "step": 200 }, { "epoch": 0.79, "eval_logits/chosen": -1.359634280204773, "eval_logits/rejected": 0.4307384192943573, "eval_logps/chosen": -248.32972717285156, "eval_logps/rejected": -347.9086608886719, "eval_loss": 0.5082926750183105, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.48149970173835754, "eval_rewards/margins": 0.749422013759613, "eval_rewards/rejected": -1.2309216260910034, "eval_runtime": 409.9823, "eval_samples_per_second": 9.81, "eval_steps_per_second": 0.307, "step": 200 }, { "epoch": 0.83, "grad_norm": 18.630491734171713, "learning_rate": 4.141140257879319e-08, "logits/chosen": 0.14813140034675598, "logits/rejected": 0.9779118299484253, "logps/chosen": -279.84320068359375, "logps/rejected": -633.3928833007812, "loss": 0.2247, "rewards/accuracies": 0.96875, "rewards/chosen": -0.1574528068304062, "rewards/margins": 3.489206314086914, "rewards/rejected": -3.6466591358184814, "step": 210 }, { "epoch": 0.87, 
"grad_norm": 28.497946746927017, "learning_rate": 2.4328749671846117e-08, "logits/chosen": -0.18863503634929657, "logits/rejected": 0.7909995317459106, "logps/chosen": -280.1761779785156, "logps/rejected": -809.5506591796875, "loss": 0.2107, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3041425049304962, "rewards/margins": 4.856934547424316, "rewards/rejected": -5.161076545715332, "step": 220 }, { "epoch": 0.91, "grad_norm": 23.671408143492002, "learning_rate": 1.1599808329836174e-08, "logits/chosen": 0.07416832447052002, "logits/rejected": 0.9995840787887573, "logps/chosen": -275.8365783691406, "logps/rejected": -751.8892822265625, "loss": 0.2143, "rewards/accuracies": 0.96875, "rewards/chosen": -0.24566996097564697, "rewards/margins": 4.163148403167725, "rewards/rejected": -4.408819198608398, "step": 230 }, { "epoch": 0.95, "grad_norm": 27.856145151796408, "learning_rate": 3.4701487751534475e-09, "logits/chosen": -0.11121706664562225, "logits/rejected": 0.7320507168769836, "logps/chosen": -341.3576965332031, "logps/rejected": -890.82421875, "loss": 0.215, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.3052319586277008, "rewards/margins": 5.738637924194336, "rewards/rejected": -6.043869972229004, "step": 240 }, { "epoch": 0.99, "grad_norm": 25.41636406288576, "learning_rate": 9.661062636148743e-11, "logits/chosen": -0.6346914172172546, "logits/rejected": 0.8026138544082642, "logps/chosen": -268.1880187988281, "logps/rejected": -801.5145263671875, "loss": 0.186, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3312497138977051, "rewards/margins": 3.949364185333252, "rewards/rejected": -4.280614376068115, "step": 250 }, { "epoch": 1.0, "step": 252, "total_flos": 0.0, "train_loss": 0.35703677506673903, "train_runtime": 4562.5722, "train_samples_per_second": 3.533, "train_steps_per_second": 0.055 } ], "logging_steps": 10, "max_steps": 252, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }