{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9904153354632586, "eval_steps": 500, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006389776357827476, "grad_norm": 11.092770897372258, "learning_rate": 1.0638297872340425e-08, "logits/chosen": -1.4375, "logits/rejected": -1.4765625, "logps/chosen": -124.0, "logps/rejected": -106.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06389776357827476, "grad_norm": 11.4814018660731, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.6171875, "logits/rejected": -1.546875, "logps/chosen": -148.0, "logps/rejected": -131.0, "loss": 0.693, "rewards/accuracies": 0.2222222238779068, "rewards/chosen": 0.000873565673828125, "rewards/margins": -0.0019073486328125, "rewards/rejected": 0.002777099609375, "step": 10 }, { "epoch": 0.12779552715654952, "grad_norm": 10.808846928393402, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.59375, "logits/rejected": -1.5, "logps/chosen": -148.0, "logps/rejected": -119.0, "loss": 0.6909, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.00093841552734375, "rewards/margins": 0.00909423828125, "rewards/rejected": -0.00811767578125, "step": 20 }, { "epoch": 0.19169329073482427, "grad_norm": 11.114832070482858, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -1.5859375, "logits/rejected": -1.53125, "logps/chosen": -138.0, "logps/rejected": -130.0, "loss": 0.688, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.022705078125, "rewards/margins": 0.01031494140625, "rewards/rejected": -0.032958984375, "step": 30 }, { "epoch": 0.25559105431309903, "grad_norm": 10.629612292618301, "learning_rate": 4.25531914893617e-07, "logits/chosen": -1.5546875, "logits/rejected": -1.5234375, "logps/chosen": -137.0, "logps/rejected": -122.0, "loss": 0.6849, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0517578125, "rewards/margins": 0.0220947265625, "rewards/rejected": -0.07373046875, "step": 40 }, { "epoch": 0.3194888178913738, "grad_norm": 10.309070114270924, "learning_rate": 4.96437054631829e-07, "logits/chosen": -1.59375, "logits/rejected": -1.5703125, "logps/chosen": -148.0, "logps/rejected": -134.0, "loss": 0.6751, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.11767578125, "rewards/margins": 0.02880859375, "rewards/rejected": -0.146484375, "step": 50 }, { "epoch": 0.38338658146964855, "grad_norm": 10.691839409341034, "learning_rate": 4.845605700712589e-07, "logits/chosen": -1.5390625, "logits/rejected": -1.5390625, "logps/chosen": -138.0, "logps/rejected": -126.0, "loss": 0.6763, "rewards/accuracies": 0.53125, "rewards/chosen": -0.208984375, "rewards/margins": 0.0380859375, "rewards/rejected": -0.2470703125, "step": 60 }, { "epoch": 0.4472843450479233, "grad_norm": 10.976743522383872, "learning_rate": 4.7268408551068883e-07, "logits/chosen": -1.578125, "logits/rejected": -1.5078125, "logps/chosen": -146.0, "logps/rejected": -130.0, "loss": 0.6659, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.158203125, "rewards/margins": 0.08154296875, "rewards/rejected": -0.2392578125, "step": 70 }, { "epoch": 0.5111821086261981, "grad_norm": 9.937714950468724, "learning_rate": 4.6080760095011875e-07, "logits/chosen": -1.546875, "logits/rejected": -1.5390625, "logps/chosen": -139.0, "logps/rejected": -134.0, "loss": 0.6732, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.25390625, "rewards/margins": 0.057373046875, "rewards/rejected": -0.3125, "step": 80 }, { "epoch": 0.5750798722044729, "grad_norm": 10.448001432920702, "learning_rate": 4.4893111638954866e-07, "logits/chosen": -1.53125, "logits/rejected": -1.5078125, "logps/chosen": -142.0, "logps/rejected": -135.0, "loss": 0.659, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3125, "rewards/margins": 0.08935546875, "rewards/rejected": -0.40234375, "step": 90 }, { "epoch": 0.6389776357827476, "grad_norm": 10.03442318899385, "learning_rate": 4.3705463182897863e-07, "logits/chosen": -1.59375, "logits/rejected": -1.5625, "logps/chosen": -152.0, "logps/rejected": -136.0, "loss": 0.6485, "rewards/accuracies": 0.65625, "rewards/chosen": -0.306640625, "rewards/margins": 0.154296875, "rewards/rejected": -0.4609375, "step": 100 }, { "epoch": 0.7028753993610224, "grad_norm": 10.349608536461915, "learning_rate": 4.251781472684085e-07, "logits/chosen": -1.5, "logits/rejected": -1.484375, "logps/chosen": -142.0, "logps/rejected": -136.0, "loss": 0.6387, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.49609375, "rewards/margins": 0.130859375, "rewards/rejected": -0.625, "step": 110 }, { "epoch": 0.7667731629392971, "grad_norm": 10.539570985663747, "learning_rate": 4.1330166270783846e-07, "logits/chosen": -1.5078125, "logits/rejected": -1.484375, "logps/chosen": -140.0, "logps/rejected": -132.0, "loss": 0.6411, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.59375, "rewards/margins": 0.1728515625, "rewards/rejected": -0.76953125, "step": 120 }, { "epoch": 0.8306709265175719, "grad_norm": 9.94604093638921, "learning_rate": 4.0142517814726837e-07, "logits/chosen": -1.5625, "logits/rejected": -1.484375, "logps/chosen": -153.0, "logps/rejected": -127.5, "loss": 0.6196, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.66015625, "rewards/margins": 0.337890625, "rewards/rejected": -0.99609375, "step": 130 }, { "epoch": 0.8945686900958466, "grad_norm": 9.253374940542843, "learning_rate": 3.8954869358669834e-07, "logits/chosen": -1.5546875, "logits/rejected": -1.4765625, "logps/chosen": -155.0, "logps/rejected": -141.0, "loss": 0.6091, "rewards/accuracies": 0.65625, "rewards/chosen": -0.83984375, "rewards/margins": 0.28125, "rewards/rejected": -1.1171875, "step": 140 }, { "epoch": 0.9584664536741214, "grad_norm": 9.147492605620393, "learning_rate": 3.7767220902612825e-07, "logits/chosen": -1.5390625, "logits/rejected": -1.5078125, "logps/chosen": -160.0, "logps/rejected": -142.0, "loss": 0.6096, "rewards/accuracies": 0.71875, "rewards/chosen": -0.80859375, "rewards/margins": 0.349609375, "rewards/rejected": -1.15625, "step": 150 }, { "epoch": 0.9968051118210862, "eval_logits/chosen": -1.484375, "eval_logits/rejected": -1.4765625, "eval_logps/chosen": -145.0, "eval_logps/rejected": -134.0, "eval_loss": 0.6089843511581421, "eval_rewards/accuracies": 0.6607142686843872, "eval_rewards/chosen": -0.65234375, "eval_rewards/margins": 0.34765625, "eval_rewards/rejected": -1.0, "eval_runtime": 11.4557, "eval_samples_per_second": 17.459, "eval_steps_per_second": 0.611, "step": 156 }, { "epoch": 1.0223642172523961, "grad_norm": 9.22711379768867, "learning_rate": 3.6579572446555817e-07, "logits/chosen": -1.4921875, "logits/rejected": -1.46875, "logps/chosen": -141.0, "logps/rejected": -139.0, "loss": 0.5811, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8828125, "rewards/margins": 0.34765625, "rewards/rejected": -1.2265625, "step": 160 }, { "epoch": 1.0862619808306708, "grad_norm": 9.508478185274804, "learning_rate": 3.5391923990498813e-07, "logits/chosen": -1.5078125, "logits/rejected": -1.5, "logps/chosen": -145.0, "logps/rejected": -134.0, "loss": 0.5337, "rewards/accuracies": 0.78125, "rewards/chosen": -0.71875, "rewards/margins": 0.5546875, "rewards/rejected": -1.2734375, "step": 170 }, { "epoch": 1.1501597444089458, "grad_norm": 9.316918076373987, "learning_rate": 3.42042755344418e-07, "logits/chosen": -1.5625, "logits/rejected": -1.53125, "logps/chosen": -157.0, "logps/rejected": -146.0, "loss": 0.544, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.66015625, "rewards/margins": 0.56640625, "rewards/rejected": -1.2265625, "step": 180 }, { "epoch": 1.2140575079872205, "grad_norm": 8.759483351227418, "learning_rate": 3.3016627078384796e-07, "logits/chosen": -1.5703125, "logits/rejected": -1.5234375, "logps/chosen": -152.0, "logps/rejected": -146.0, "loss": 0.5362, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.74609375, "rewards/margins": 0.5, "rewards/rejected": -1.25, "step": 190 }, { "epoch": 1.2779552715654952, "grad_norm": 9.97195116624129, "learning_rate": 3.182897862232779e-07, "logits/chosen": -1.515625, "logits/rejected": -1.515625, "logps/chosen": -151.0, "logps/rejected": -141.0, "loss": 0.5373, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.7890625, "rewards/margins": 0.55859375, "rewards/rejected": -1.34375, "step": 200 }, { "epoch": 1.34185303514377, "grad_norm": 8.576220811968858, "learning_rate": 3.0641330166270784e-07, "logits/chosen": -1.578125, "logits/rejected": -1.5234375, "logps/chosen": -151.0, "logps/rejected": -140.0, "loss": 0.5339, "rewards/accuracies": 0.6875, "rewards/chosen": -0.83984375, "rewards/margins": 0.462890625, "rewards/rejected": -1.3046875, "step": 210 }, { "epoch": 1.4057507987220448, "grad_norm": 8.688671845017401, "learning_rate": 2.9453681710213776e-07, "logits/chosen": -1.53125, "logits/rejected": -1.53125, "logps/chosen": -148.0, "logps/rejected": -143.0, "loss": 0.5366, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8671875, "rewards/margins": 0.5078125, "rewards/rejected": -1.375, "step": 220 }, { "epoch": 1.4696485623003195, "grad_norm": 9.030983047385451, "learning_rate": 2.8266033254156767e-07, "logits/chosen": -1.546875, "logits/rejected": -1.515625, "logps/chosen": -145.0, "logps/rejected": -142.0, "loss": 0.5346, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8046875, "rewards/margins": 0.53515625, "rewards/rejected": -1.34375, "step": 230 }, { "epoch": 1.5335463258785942, "grad_norm": 11.622095941624893, "learning_rate": 2.7078384798099764e-07, "logits/chosen": -1.484375, "logits/rejected": -1.4609375, "logps/chosen": -145.0, "logps/rejected": -153.0, "loss": 0.553, "rewards/accuracies": 0.75, "rewards/chosen": -0.80859375, "rewards/margins": 0.51171875, "rewards/rejected": -1.3203125, "step": 240 }, { "epoch": 1.5974440894568689, "grad_norm": 9.38942767820535, "learning_rate": 2.589073634204275e-07, "logits/chosen": -1.5625, "logits/rejected": -1.4765625, "logps/chosen": -146.0, "logps/rejected": -132.0, "loss": 0.54, "rewards/accuracies": 0.78125, "rewards/chosen": -0.50390625, "rewards/margins": 0.578125, "rewards/rejected": -1.078125, "step": 250 }, { "epoch": 1.6613418530351438, "grad_norm": 9.87005880531835, "learning_rate": 2.4703087885985747e-07, "logits/chosen": -1.5546875, "logits/rejected": -1.5234375, "logps/chosen": -160.0, "logps/rejected": -148.0, "loss": 0.5312, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.55859375, "rewards/margins": 0.5546875, "rewards/rejected": -1.1171875, "step": 260 }, { "epoch": 1.7252396166134185, "grad_norm": 8.251251897175216, "learning_rate": 2.351543942992874e-07, "logits/chosen": -1.5234375, "logits/rejected": -1.4921875, "logps/chosen": -146.0, "logps/rejected": -143.0, "loss": 0.5354, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.609375, "rewards/margins": 0.56640625, "rewards/rejected": -1.171875, "step": 270 }, { "epoch": 1.7891373801916934, "grad_norm": 9.817987165601615, "learning_rate": 2.2327790973871732e-07, "logits/chosen": -1.53125, "logits/rejected": -1.4921875, "logps/chosen": -148.0, "logps/rejected": -144.0, "loss": 0.5347, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5546875, "rewards/margins": 0.58984375, "rewards/rejected": -1.1484375, "step": 280 }, { "epoch": 1.8530351437699681, "grad_norm": 9.610664273576093, "learning_rate": 2.1140142517814726e-07, "logits/chosen": -1.5390625, "logits/rejected": -1.4921875, "logps/chosen": -164.0, "logps/rejected": -148.0, "loss": 0.5186, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.57421875, "rewards/margins": 0.59375, "rewards/rejected": -1.1640625, "step": 290 }, { "epoch": 1.9169329073482428, "grad_norm": 9.80773406850179, "learning_rate": 1.9952494061757718e-07, "logits/chosen": -1.53125, "logits/rejected": -1.5078125, "logps/chosen": -151.0, "logps/rejected": -148.0, "loss": 0.5476, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.75, "rewards/margins": 0.439453125, "rewards/rejected": -1.1875, "step": 300 }, { "epoch": 1.9808306709265175, "grad_norm": 9.05899810741374, "learning_rate": 1.876484560570071e-07, "logits/chosen": -1.578125, "logits/rejected": -1.53125, "logps/chosen": -156.0, "logps/rejected": -144.0, "loss": 0.5245, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.71875, "rewards/margins": 0.5703125, "rewards/rejected": -1.2890625, "step": 310 }, { "epoch": 2.0, "eval_logits/chosen": -1.5, "eval_logits/rejected": -1.4921875, "eval_logps/chosen": -144.0, "eval_logps/rejected": -134.0, "eval_loss": 0.5994531512260437, "eval_rewards/accuracies": 0.6964285969734192, "eval_rewards/chosen": -0.546875, "eval_rewards/margins": 0.458984375, "eval_rewards/rejected": -1.0078125, "eval_runtime": 11.4667, "eval_samples_per_second": 17.442, "eval_steps_per_second": 0.61, "step": 313 }, { "epoch": 2.0447284345047922, "grad_norm": 8.064019736193583, "learning_rate": 1.7577197149643706e-07, "logits/chosen": -1.546875, "logits/rejected": -1.5, "logps/chosen": -155.0, "logps/rejected": -145.0, "loss": 0.4849, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.84765625, "rewards/margins": 0.671875, "rewards/rejected": -1.515625, "step": 320 }, { "epoch": 2.108626198083067, "grad_norm": 7.485198225503147, "learning_rate": 1.6389548693586697e-07, "logits/chosen": -1.5625, "logits/rejected": -1.5078125, "logps/chosen": -144.0, "logps/rejected": -136.0, "loss": 0.495, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8671875, "rewards/margins": 0.60546875, "rewards/rejected": -1.4765625, "step": 330 }, { "epoch": 2.1725239616613417, "grad_norm": 9.966020126485736, "learning_rate": 1.520190023752969e-07, "logits/chosen": -1.546875, "logits/rejected": -1.5, "logps/chosen": -149.0, "logps/rejected": -137.0, "loss": 0.4747, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.83203125, "rewards/margins": 0.7109375, "rewards/rejected": -1.546875, "step": 340 }, { "epoch": 2.236421725239617, "grad_norm": 9.297651299143556, "learning_rate": 1.4014251781472683e-07, "logits/chosen": -1.5625, "logits/rejected": -1.53125, "logps/chosen": -152.0, "logps/rejected": -144.0, "loss": 0.4823, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.8203125, "rewards/margins": 0.640625, "rewards/rejected": -1.4609375, "step": 350 }, { "epoch": 2.3003194888178915, "grad_norm": 9.172329731503968, "learning_rate": 1.2826603325415677e-07, "logits/chosen": -1.53125, "logits/rejected": -1.5078125, "logps/chosen": -146.0, "logps/rejected": -137.0, "loss": 0.4743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8046875, "rewards/margins": 0.74609375, "rewards/rejected": -1.546875, "step": 360 }, { "epoch": 2.364217252396166, "grad_norm": 9.394249924534705, "learning_rate": 1.163895486935867e-07, "logits/chosen": -1.5859375, "logits/rejected": -1.5390625, "logps/chosen": -164.0, "logps/rejected": -151.0, "loss": 0.4695, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.5703125, "rewards/margins": 0.69921875, "rewards/rejected": -1.2734375, "step": 370 }, { "epoch": 2.428115015974441, "grad_norm": 8.43211550191306, "learning_rate": 1.0451306413301662e-07, "logits/chosen": -1.546875, "logits/rejected": -1.515625, "logps/chosen": -141.0, "logps/rejected": -136.0, "loss": 0.4747, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8515625, "rewards/margins": 0.69140625, "rewards/rejected": -1.5390625, "step": 380 }, { "epoch": 2.4920127795527156, "grad_norm": 9.162321870225993, "learning_rate": 9.263657957244655e-08, "logits/chosen": -1.5234375, "logits/rejected": -1.4921875, "logps/chosen": -148.0, "logps/rejected": -144.0, "loss": 0.4763, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8671875, "rewards/margins": 0.6640625, "rewards/rejected": -1.53125, "step": 390 }, { "epoch": 2.5559105431309903, "grad_norm": 9.156556482449458, "learning_rate": 8.076009501187649e-08, "logits/chosen": -1.5703125, "logits/rejected": -1.5234375, "logps/chosen": -153.0, "logps/rejected": -139.0, "loss": 0.468, "rewards/accuracies": 0.8125, "rewards/chosen": -0.79296875, "rewards/margins": 0.66796875, "rewards/rejected": -1.4609375, "step": 400 }, { "epoch": 2.619808306709265, "grad_norm": 8.694584668338557, "learning_rate": 6.88836104513064e-08, "logits/chosen": -1.5625, "logits/rejected": -1.5, "logps/chosen": -155.0, "logps/rejected": -143.0, "loss": 0.4845, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.79296875, "rewards/margins": 0.67578125, "rewards/rejected": -1.46875, "step": 410 }, { "epoch": 2.68370607028754, "grad_norm": 8.76105196917074, "learning_rate": 5.700712589073634e-08, "logits/chosen": -1.5390625, "logits/rejected": -1.5, "logps/chosen": -151.0, "logps/rejected": -144.0, "loss": 0.4679, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.80078125, "rewards/margins": 0.70703125, "rewards/rejected": -1.5078125, "step": 420 }, { "epoch": 2.747603833865815, "grad_norm": 10.082882017826297, "learning_rate": 4.5130641330166267e-08, "logits/chosen": -1.5078125, "logits/rejected": -1.46875, "logps/chosen": -145.0, "logps/rejected": -142.0, "loss": 0.4617, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.765625, "rewards/margins": 0.78125, "rewards/rejected": -1.546875, "step": 430 }, { "epoch": 2.8115015974440896, "grad_norm": 8.539769794548567, "learning_rate": 3.32541567695962e-08, "logits/chosen": -1.53125, "logits/rejected": -1.4765625, "logps/chosen": -142.0, "logps/rejected": -136.0, "loss": 0.4761, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.8203125, "rewards/margins": 0.80859375, "rewards/rejected": -1.625, "step": 440 }, { "epoch": 2.8753993610223643, "grad_norm": 9.882517602717023, "learning_rate": 2.1377672209026125e-08, "logits/chosen": -1.5859375, "logits/rejected": -1.5390625, "logps/chosen": -149.0, "logps/rejected": -148.0, "loss": 0.4793, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.75390625, "rewards/margins": 0.66796875, "rewards/rejected": -1.421875, "step": 450 }, { "epoch": 2.939297124600639, "grad_norm": 8.662355843337348, "learning_rate": 9.501187648456057e-09, "logits/chosen": -1.5546875, "logits/rejected": -1.5, "logps/chosen": -153.0, "logps/rejected": -143.0, "loss": 0.4931, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.87890625, "rewards/margins": 0.6953125, "rewards/rejected": -1.578125, "step": 460 }, { "epoch": 2.9904153354632586, "eval_logits/chosen": -1.5, "eval_logits/rejected": -1.4921875, "eval_logps/chosen": -145.0, "eval_logps/rejected": -135.0, "eval_loss": 0.5950781106948853, "eval_rewards/accuracies": 0.6785714030265808, "eval_rewards/chosen": -0.59765625, "eval_rewards/margins": 0.498046875, "eval_rewards/rejected": -1.1015625, "eval_runtime": 14.1407, "eval_samples_per_second": 14.144, "eval_steps_per_second": 0.495, "step": 468 }, { "epoch": 2.9904153354632586, "step": 468, "total_flos": 0.0, "train_loss": 0.5567819970285791, "train_runtime": 4306.1528, "train_samples_per_second": 6.965, "train_steps_per_second": 0.109 } ], "logging_steps": 10, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }