diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3410 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 2038, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 11.418422500469418, + "learning_rate": 2.4509803921568627e-09, + "logits/chosen": -0.4609375, + "logits/rejected": -0.5625, + "logps/chosen": -1832.0, + "logps/rejected": -1832.0, + "loss": 0.6914, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 10.679791571668222, + "learning_rate": 2.4509803921568626e-08, + "logits/chosen": -0.55859375, + "logits/rejected": -0.58203125, + "logps/chosen": -2784.0, + "logps/rejected": -2624.0, + "loss": 0.6994, + "rewards/accuracies": 0.2888889014720917, + "rewards/chosen": -0.0011749267578125, + "rewards/margins": -0.0142822265625, + "rewards/rejected": 0.01312255859375, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 9.399862723559004, + "learning_rate": 4.901960784313725e-08, + "logits/chosen": -0.62109375, + "logits/rejected": -0.72265625, + "logps/chosen": -2064.0, + "logps/rejected": -1632.0, + "loss": 0.6943, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.01373291015625, + "rewards/margins": 0.012939453125, + "rewards/rejected": 0.00079345703125, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 11.520323557414129, + "learning_rate": 7.352941176470588e-08, + "logits/chosen": -0.61328125, + "logits/rejected": -0.6015625, + "logps/chosen": -1984.0, + "logps/rejected": -1968.0, + "loss": 0.6943, + "rewards/accuracies": 0.36000004410743713, + "rewards/chosen": -0.01300048828125, + "rewards/margins": -0.0107421875, + "rewards/rejected": -0.002197265625, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 11.053039743567291, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -0.5078125, + "logits/rejected": -0.54296875, + "logps/chosen": -2176.0, + "logps/rejected": -2024.0, + "loss": 0.69, + "rewards/accuracies": 0.36000004410743713, + "rewards/chosen": 0.01409912109375, + "rewards/margins": 0.0230712890625, + "rewards/rejected": -0.009033203125, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 10.828208455861878, + "learning_rate": 1.2254901960784314e-07, + "logits/chosen": -0.6015625, + "logits/rejected": -0.703125, + "logps/chosen": -2080.0, + "logps/rejected": -1624.0, + "loss": 0.6967, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0101318359375, + "rewards/margins": 0.00958251953125, + "rewards/rejected": 0.000507354736328125, + "step": 50 + }, + { + "epoch": 0.03, + "grad_norm": 10.30578886430554, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -0.5625, + "logits/rejected": -0.703125, + "logps/chosen": -2608.0, + "logps/rejected": -1904.0, + "loss": 0.6971, + "rewards/accuracies": 0.40000003576278687, + "rewards/chosen": -0.004302978515625, + "rewards/margins": -0.017578125, + "rewards/rejected": 0.01324462890625, + "step": 60 + }, + { + "epoch": 0.03, + "grad_norm": 11.148448836585857, + "learning_rate": 1.715686274509804e-07, + "logits/chosen": -0.66015625, + "logits/rejected": -0.7109375, + "logps/chosen": -2112.0, + "logps/rejected": -1880.0, + "loss": 0.6893, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 0.055419921875, + "rewards/margins": 0.03466796875, + "rewards/rejected": 0.020751953125, + "step": 70 + }, + { + "epoch": 0.04, + "grad_norm": 9.100710948464306, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": -0.53515625, + "logits/rejected": -0.5625, + "logps/chosen": -2256.0, + "logps/rejected": -2040.0, + "loss": 0.6961, + "rewards/accuracies": 0.41999998688697815, + "rewards/chosen": 0.050537109375, + "rewards/margins": 0.0162353515625, + "rewards/rejected": 0.034423828125, + "step": 80 + }, + { + "epoch": 0.04, + "grad_norm": 11.809225699546186, + "learning_rate": 2.2058823529411763e-07, + "logits/chosen": -0.52734375, + "logits/rejected": -0.625, + "logps/chosen": -2000.0, + "logps/rejected": -1712.0, + "loss": 0.6807, + "rewards/accuracies": 0.48000001907348633, + "rewards/chosen": 0.0888671875, + "rewards/margins": 0.0196533203125, + "rewards/rejected": 0.0693359375, + "step": 90 + }, + { + "epoch": 0.05, + "grad_norm": 10.00322127291456, + "learning_rate": 2.4509803921568627e-07, + "logits/chosen": -0.5703125, + "logits/rejected": -0.66015625, + "logps/chosen": -2208.0, + "logps/rejected": -1944.0, + "loss": 0.684, + "rewards/accuracies": 0.46000003814697266, + "rewards/chosen": 0.1591796875, + "rewards/margins": 0.0250244140625, + "rewards/rejected": 0.1337890625, + "step": 100 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -0.6640625, + "eval_logits/rejected": -0.72265625, + "eval_logps/chosen": -2352.0, + "eval_logps/rejected": -2048.0, + "eval_loss": 0.6767656207084656, + "eval_rewards/accuracies": 0.449404776096344, + "eval_rewards/chosen": 0.2314453125, + "eval_rewards/margins": 0.04052734375, + "eval_rewards/rejected": 0.1904296875, + "eval_runtime": 90.1206, + "eval_samples_per_second": 22.192, + "eval_steps_per_second": 0.466, + "step": 100 + }, + { + "epoch": 0.05, + "grad_norm": 11.799509248452377, + "learning_rate": 2.6960784313725486e-07, + "logits/chosen": -0.55859375, + "logits/rejected": -0.6640625, + "logps/chosen": -2432.0, + "logps/rejected": -1888.0, + "loss": 0.682, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.255859375, + "rewards/margins": 0.04736328125, + "rewards/rejected": 0.2080078125, + "step": 110 + }, + { + "epoch": 0.06, + "grad_norm": 9.997348094627812, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.59765625, + "logits/rejected": -0.5546875, + "logps/chosen": -2304.0, + "logps/rejected": -2352.0, + "loss": 0.6822, + "rewards/accuracies": 0.46000003814697266, + "rewards/chosen": 0.28515625, + "rewards/margins": -0.00159454345703125, + "rewards/rejected": 0.287109375, + "step": 120 + }, + { + "epoch": 0.06, + "grad_norm": 10.489336936701164, + "learning_rate": 3.1862745098039215e-07, + "logits/chosen": -0.58203125, + "logits/rejected": -0.69140625, + "logps/chosen": -2240.0, + "logps/rejected": -1608.0, + "loss": 0.6706, + "rewards/accuracies": 0.46000003814697266, + "rewards/chosen": 0.34375, + "rewards/margins": 0.103515625, + "rewards/rejected": 0.2412109375, + "step": 130 + }, + { + "epoch": 0.07, + "grad_norm": 8.373655221952864, + "learning_rate": 3.431372549019608e-07, + "logits/chosen": -0.6328125, + "logits/rejected": -0.6640625, + "logps/chosen": -2256.0, + "logps/rejected": -2128.0, + "loss": 0.6786, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.0556640625, + "rewards/rejected": 0.322265625, + "step": 140 + }, + { + "epoch": 0.07, + "grad_norm": 7.530198235177303, + "learning_rate": 3.6764705882352943e-07, + "logits/chosen": -0.6328125, + "logits/rejected": -0.74609375, + "logps/chosen": -2560.0, + "logps/rejected": -2000.0, + "loss": 0.6618, + "rewards/accuracies": 0.5399999618530273, + "rewards/chosen": 0.466796875, + "rewards/margins": 0.1103515625, + "rewards/rejected": 0.35546875, + "step": 150 + }, + { + "epoch": 0.08, + "grad_norm": 9.93615787272239, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": -0.578125, + "logits/rejected": -0.62109375, + "logps/chosen": -2544.0, + "logps/rejected": -2272.0, + "loss": 0.6632, + "rewards/accuracies": 0.4599999785423279, + "rewards/chosen": 0.4921875, + "rewards/margins": 0.08984375, + "rewards/rejected": 0.40234375, + "step": 160 + }, + { + "epoch": 0.08, + "grad_norm": 8.034753030681635, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -0.59375, + "logits/rejected": -0.703125, + "logps/chosen": -2400.0, + "logps/rejected": -1968.0, + "loss": 0.6664, + "rewards/accuracies": 0.42000001668930054, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.0771484375, + "rewards/rejected": 0.41015625, + "step": 170 + }, + { + "epoch": 0.09, + "grad_norm": 8.810211928299461, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -0.5, + "logits/rejected": -0.625, + "logps/chosen": -3008.0, + "logps/rejected": -2416.0, + "loss": 0.6614, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.7265625, + "rewards/margins": 0.2216796875, + "rewards/rejected": 0.5078125, + "step": 180 + }, + { + "epoch": 0.09, + "grad_norm": 8.455528608933424, + "learning_rate": 4.656862745098039e-07, + "logits/chosen": -0.484375, + "logits/rejected": -0.62109375, + "logps/chosen": -2544.0, + "logps/rejected": -2064.0, + "loss": 0.6596, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.6171875, + "rewards/margins": 0.1083984375, + "rewards/rejected": 0.51171875, + "step": 190 + }, + { + "epoch": 0.1, + "grad_norm": 9.256083265710721, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -0.515625, + "logits/rejected": -0.5625, + "logps/chosen": -2768.0, + "logps/rejected": -2576.0, + "loss": 0.663, + "rewards/accuracies": 0.46000003814697266, + "rewards/chosen": 0.70703125, + "rewards/margins": 0.05224609375, + "rewards/rejected": 0.65625, + "step": 200 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -0.66796875, + "eval_logits/rejected": -0.7265625, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2016.0, + "eval_loss": 0.6565937399864197, + "eval_rewards/accuracies": 0.494047611951828, + "eval_rewards/chosen": 0.59765625, + "eval_rewards/margins": 0.11083984375, + "eval_rewards/rejected": 0.48828125, + "eval_runtime": 89.4808, + "eval_samples_per_second": 22.351, + "eval_steps_per_second": 0.469, + "step": 200 + }, + { + "epoch": 0.1, + "grad_norm": 10.458558020929722, + "learning_rate": 4.999867958705476e-07, + "logits/chosen": -0.58203125, + "logits/rejected": -0.61328125, + "logps/chosen": -2352.0, + "logps/rejected": -2272.0, + "loss": 0.6743, + "rewards/accuracies": 0.47999995946884155, + "rewards/chosen": 0.61328125, + "rewards/margins": 0.0166015625, + "rewards/rejected": 0.59765625, + "step": 210 + }, + { + "epoch": 0.11, + "grad_norm": 10.655404888327219, + "learning_rate": 4.999061090193831e-07, + "logits/chosen": -0.609375, + "logits/rejected": -0.70703125, + "logps/chosen": -2528.0, + "logps/rejected": -2112.0, + "loss": 0.6801, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 0.6796875, + "rewards/margins": 0.1416015625, + "rewards/rejected": 0.53515625, + "step": 220 + }, + { + "epoch": 0.11, + "grad_norm": 10.56468733064606, + "learning_rate": 4.997520945910046e-07, + "logits/chosen": -0.58203125, + "logits/rejected": -0.7109375, + "logps/chosen": -2368.0, + "logps/rejected": -1848.0, + "loss": 0.674, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.1865234375, + "rewards/rejected": 0.400390625, + "step": 230 + }, + { + "epoch": 0.12, + "grad_norm": 9.35483083586776, + "learning_rate": 4.995247977764035e-07, + "logits/chosen": -0.6875, + "logits/rejected": -0.75, + "logps/chosen": -2192.0, + "logps/rejected": -1968.0, + "loss": 0.6788, + "rewards/accuracies": 0.4399999976158142, + "rewards/chosen": 0.484375, + "rewards/margins": 0.0615234375, + "rewards/rejected": 0.423828125, + "step": 240 + }, + { + "epoch": 0.12, + "grad_norm": 9.99023631027908, + "learning_rate": 4.992242852691269e-07, + "logits/chosen": -0.62109375, + "logits/rejected": -0.703125, + "logps/chosen": -2160.0, + "logps/rejected": -1960.0, + "loss": 0.6734, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.5, + "rewards/margins": 0.1396484375, + "rewards/rejected": 0.361328125, + "step": 250 + }, + { + "epoch": 0.13, + "grad_norm": 8.81462332150393, + "learning_rate": 4.988506452457066e-07, + "logits/chosen": -0.5625, + "logits/rejected": -0.6484375, + "logps/chosen": -2272.0, + "logps/rejected": -1952.0, + "loss": 0.6606, + "rewards/accuracies": 0.4800000786781311, + "rewards/chosen": 0.419921875, + "rewards/margins": 0.04931640625, + "rewards/rejected": 0.37109375, + "step": 260 + }, + { + "epoch": 0.13, + "grad_norm": 9.565836577883003, + "learning_rate": 4.984039873397879e-07, + "logits/chosen": -0.5703125, + "logits/rejected": -0.6953125, + "logps/chosen": -2608.0, + "logps/rejected": -2032.0, + "loss": 0.6715, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.60546875, + "rewards/margins": 0.1826171875, + "rewards/rejected": 0.42578125, + "step": 270 + }, + { + "epoch": 0.14, + "grad_norm": 7.882899392773696, + "learning_rate": 4.9788444260996e-07, + "logits/chosen": -0.4921875, + "logits/rejected": -0.55859375, + "logps/chosen": -2512.0, + "logps/rejected": -2144.0, + "loss": 0.6488, + "rewards/accuracies": 0.4800000786781311, + "rewards/chosen": 0.578125, + "rewards/margins": 0.1357421875, + "rewards/rejected": 0.443359375, + "step": 280 + }, + { + "epoch": 0.14, + "grad_norm": 8.30641415306888, + "learning_rate": 4.97292163501301e-07, + "logits/chosen": -0.57421875, + "logits/rejected": -0.578125, + "logps/chosen": -2400.0, + "logps/rejected": -2352.0, + "loss": 0.641, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.10107421875, + "rewards/rejected": 0.5390625, + "step": 290 + }, + { + "epoch": 0.15, + "grad_norm": 8.469934208860764, + "learning_rate": 4.96627323800647e-07, + "logits/chosen": -0.427734375, + "logits/rejected": -0.6328125, + "logps/chosen": -3056.0, + "logps/rejected": -2128.0, + "loss": 0.6529, + "rewards/accuracies": 0.6599999666213989, + "rewards/chosen": 0.76953125, + "rewards/margins": 0.265625, + "rewards/rejected": 0.50390625, + "step": 300 + }, + { + "epoch": 0.15, + "eval_logits/chosen": -0.65625, + "eval_logits/rejected": -0.71875, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2016.0, + "eval_loss": 0.651296854019165, + "eval_rewards/accuracies": 0.5148809552192688, + "eval_rewards/chosen": 0.625, + "eval_rewards/margins": 0.1279296875, + "eval_rewards/rejected": 0.494140625, + "eval_runtime": 89.9953, + "eval_samples_per_second": 22.223, + "eval_steps_per_second": 0.467, + "step": 300 + }, + { + "epoch": 0.15, + "grad_norm": 9.606536101637007, + "learning_rate": 4.958901185856005e-07, + "logits/chosen": -0.6015625, + "logits/rejected": -0.640625, + "logps/chosen": -2688.0, + "logps/rejected": -2496.0, + "loss": 0.6578, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 0.69140625, + "rewards/margins": 0.06005859375, + "rewards/rejected": 0.62890625, + "step": 310 + }, + { + "epoch": 0.16, + "grad_norm": 8.306258892059772, + "learning_rate": 4.95080764167289e-07, + "logits/chosen": -0.474609375, + "logits/rejected": -0.4921875, + "logps/chosen": -2400.0, + "logps/rejected": -2336.0, + "loss": 0.6911, + "rewards/accuracies": 0.4599999785423279, + "rewards/chosen": 0.5859375, + "rewards/margins": 0.042724609375, + "rewards/rejected": 0.54296875, + "step": 320 + }, + { + "epoch": 0.16, + "grad_norm": 9.33758707972592, + "learning_rate": 4.941994980268966e-07, + "logits/chosen": -0.640625, + "logits/rejected": -0.65234375, + "logps/chosen": -2008.0, + "logps/rejected": -1832.0, + "loss": 0.6607, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 0.478515625, + "rewards/margins": 0.11865234375, + "rewards/rejected": 0.361328125, + "step": 330 + }, + { + "epoch": 0.17, + "grad_norm": 8.421773008047236, + "learning_rate": 4.932465787459808e-07, + "logits/chosen": -0.50390625, + "logits/rejected": -0.48828125, + "logps/chosen": -2512.0, + "logps/rejected": -2512.0, + "loss": 0.6597, + "rewards/accuracies": 0.42000001668930054, + "rewards/chosen": 0.56640625, + "rewards/margins": -0.020263671875, + "rewards/rejected": 0.5859375, + "step": 340 + }, + { + "epoch": 0.17, + "grad_norm": 11.142714749304895, + "learning_rate": 4.922222859306005e-07, + "logits/chosen": -0.5, + "logits/rejected": -0.61328125, + "logps/chosen": -2112.0, + "logps/rejected": -1664.0, + "loss": 0.6529, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.46484375, + "rewards/margins": 0.13671875, + "rewards/rejected": 0.328125, + "step": 350 + }, + { + "epoch": 0.18, + "grad_norm": 9.405996440031476, + "learning_rate": 4.911269201292724e-07, + "logits/chosen": -0.478515625, + "logits/rejected": -0.58203125, + "logps/chosen": -2864.0, + "logps/rejected": -2464.0, + "loss": 0.6625, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.796875, + "rewards/margins": 0.2177734375, + "rewards/rejected": 0.578125, + "step": 360 + }, + { + "epoch": 0.18, + "grad_norm": 9.498724790073537, + "learning_rate": 4.899608027447858e-07, + "logits/chosen": -0.515625, + "logits/rejected": -0.64453125, + "logps/chosen": -2672.0, + "logps/rejected": -2112.0, + "loss": 0.659, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.71875, + "rewards/margins": 0.1923828125, + "rewards/rejected": 0.5234375, + "step": 370 + }, + { + "epoch": 0.19, + "grad_norm": 9.511360912470519, + "learning_rate": 4.887242759398945e-07, + "logits/chosen": -0.578125, + "logits/rejected": -0.6796875, + "logps/chosen": -2416.0, + "logps/rejected": -2008.0, + "loss": 0.6449, + "rewards/accuracies": 0.64000004529953, + "rewards/chosen": 0.60546875, + "rewards/margins": 0.2109375, + "rewards/rejected": 0.39453125, + "step": 380 + }, + { + "epoch": 0.19, + "grad_norm": 9.32007338457764, + "learning_rate": 4.874177025369207e-07, + "logits/chosen": -0.46484375, + "logits/rejected": -0.5859375, + "logps/chosen": -2800.0, + "logps/rejected": -2256.0, + "loss": 0.6507, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": 0.7890625, + "rewards/margins": 0.275390625, + "rewards/rejected": 0.51171875, + "step": 390 + }, + { + "epoch": 0.2, + "grad_norm": 9.245573199091506, + "learning_rate": 4.860414659112948e-07, + "logits/chosen": -0.55859375, + "logits/rejected": -0.578125, + "logps/chosen": -2176.0, + "logps/rejected": -1936.0, + "loss": 0.6371, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6328125, + "rewards/margins": 0.1474609375, + "rewards/rejected": 0.48828125, + "step": 400 + }, + { + "epoch": 0.2, + "eval_logits/chosen": -0.66796875, + "eval_logits/rejected": -0.7265625, + "eval_logps/chosen": -2304.0, + "eval_logps/rejected": -2016.0, + "eval_loss": 0.649093747138977, + "eval_rewards/accuracies": 0.5595238208770752, + "eval_rewards/chosen": 0.65625, + "eval_rewards/margins": 0.15234375, + "eval_rewards/rejected": 0.5, + "eval_runtime": 90.1263, + "eval_samples_per_second": 22.191, + "eval_steps_per_second": 0.466, + "step": 400 + }, + { + "epoch": 0.2, + "grad_norm": 8.878063287442174, + "learning_rate": 4.845959698790652e-07, + "logits/chosen": -0.578125, + "logits/rejected": -0.6171875, + "logps/chosen": -2128.0, + "logps/rejected": -1864.0, + "loss": 0.6683, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.59765625, + "rewards/margins": 0.0791015625, + "rewards/rejected": 0.515625, + "step": 410 + }, + { + "epoch": 0.21, + "grad_norm": 9.071445660844004, + "learning_rate": 4.830816385784104e-07, + "logits/chosen": -0.5078125, + "logits/rejected": -0.55078125, + "logps/chosen": -2256.0, + "logps/rejected": -1872.0, + "loss": 0.6581, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.189453125, + "rewards/rejected": 0.44140625, + "step": 420 + }, + { + "epoch": 0.21, + "grad_norm": 10.638539511395816, + "learning_rate": 4.814989163451889e-07, + "logits/chosen": -0.578125, + "logits/rejected": -0.546875, + "logps/chosen": -1840.0, + "logps/rejected": -1864.0, + "loss": 0.6701, + "rewards/accuracies": 0.5200001001358032, + "rewards/chosen": 0.52734375, + "rewards/margins": 0.03271484375, + "rewards/rejected": 0.4921875, + "step": 430 + }, + { + "epoch": 0.22, + "grad_norm": 9.196165350919857, + "learning_rate": 4.798482675825602e-07, + "logits/chosen": -0.5546875, + "logits/rejected": -0.62109375, + "logps/chosen": -2176.0, + "logps/rejected": -2128.0, + "loss": 0.6605, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 0.5703125, + "rewards/margins": 0.06591796875, + "rewards/rejected": 0.50390625, + "step": 440 + }, + { + "epoch": 0.22, + "grad_norm": 8.671692805246133, + "learning_rate": 4.781301766247215e-07, + "logits/chosen": -0.64453125, + "logits/rejected": -0.640625, + "logps/chosen": -2040.0, + "logps/rejected": -2128.0, + "loss": 0.6486, + "rewards/accuracies": 0.6200000643730164, + "rewards/chosen": 0.494140625, + "rewards/margins": 0.0693359375, + "rewards/rejected": 0.42578125, + "step": 450 + }, + { + "epoch": 0.23, + "grad_norm": 12.453319112805517, + "learning_rate": 4.7634514759479275e-07, + "logits/chosen": -0.6171875, + "logits/rejected": -0.671875, + "logps/chosen": -2096.0, + "logps/rejected": -1800.0, + "loss": 0.6577, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.1748046875, + "rewards/rejected": 0.33203125, + "step": 460 + }, + { + "epoch": 0.23, + "grad_norm": 10.997561643902582, + "learning_rate": 4.7449370425689694e-07, + "logits/chosen": -0.56640625, + "logits/rejected": -0.6171875, + "logps/chosen": -2240.0, + "logps/rejected": -2008.0, + "loss": 0.6317, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.5390625, + "rewards/margins": 0.2255859375, + "rewards/rejected": 0.3125, + "step": 470 + }, + { + "epoch": 0.24, + "grad_norm": 7.512807533628497, + "learning_rate": 4.7257638986247684e-07, + "logits/chosen": -0.466796875, + "logits/rejected": -0.6640625, + "logps/chosen": -3024.0, + "logps/rejected": -2192.0, + "loss": 0.6469, + "rewards/accuracies": 0.7199999690055847, + "rewards/chosen": 0.72265625, + "rewards/margins": 0.380859375, + "rewards/rejected": 0.34375, + "step": 480 + }, + { + "epoch": 0.24, + "grad_norm": 13.645526383003268, + "learning_rate": 4.705937669908943e-07, + "logits/chosen": -0.4921875, + "logits/rejected": -0.6015625, + "logps/chosen": -2624.0, + "logps/rejected": -2240.0, + "loss": 0.6384, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.2138671875, + "rewards/rejected": 0.421875, + "step": 490 + }, + { + "epoch": 0.25, + "grad_norm": 7.909000541119995, + "learning_rate": 4.685464173843574e-07, + "logits/chosen": -0.54296875, + "logits/rejected": -0.6484375, + "logps/chosen": -2192.0, + "logps/rejected": -1792.0, + "loss": 0.6206, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.5703125, + "rewards/margins": 0.248046875, + "rewards/rejected": 0.322265625, + "step": 500 + }, + { + "epoch": 0.25, + "eval_logits/chosen": -0.65625, + "eval_logits/rejected": -0.71484375, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2024.0, + "eval_loss": 0.6465859413146973, + "eval_rewards/accuracies": 0.5952380895614624, + "eval_rewards/chosen": 0.5390625, + "eval_rewards/margins": 0.14453125, + "eval_rewards/rejected": 0.39453125, + "eval_runtime": 89.3629, + "eval_samples_per_second": 22.381, + "eval_steps_per_second": 0.47, + "step": 500 + }, + { + "epoch": 0.25, + "grad_norm": 9.183157685366133, + "learning_rate": 4.6643494177722574e-07, + "logits/chosen": -0.5546875, + "logits/rejected": -0.58984375, + "logps/chosen": -2160.0, + "logps/rejected": -1864.0, + "loss": 0.6507, + "rewards/accuracies": 0.6200000643730164, + "rewards/chosen": 0.51953125, + "rewards/margins": 0.1279296875, + "rewards/rejected": 0.392578125, + "step": 510 + }, + { + "epoch": 0.26, + "grad_norm": 10.502858947018517, + "learning_rate": 4.6425995971974265e-07, + "logits/chosen": -0.6015625, + "logits/rejected": -0.75, + "logps/chosen": -2448.0, + "logps/rejected": -1856.0, + "loss": 0.6657, + "rewards/accuracies": 0.5199999809265137, + "rewards/chosen": 0.46875, + "rewards/margins": 0.11279296875, + "rewards/rejected": 0.357421875, + "step": 520 + }, + { + "epoch": 0.26, + "grad_norm": 10.966207624712743, + "learning_rate": 4.6202210939624607e-07, + "logits/chosen": -0.51953125, + "logits/rejected": -0.546875, + "logps/chosen": -2688.0, + "logps/rejected": -2528.0, + "loss": 0.6652, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 0.64453125, + "rewards/margins": 0.041015625, + "rewards/rejected": 0.60546875, + "step": 530 + }, + { + "epoch": 0.26, + "grad_norm": 8.724395310063244, + "learning_rate": 4.597220474379125e-07, + "logits/chosen": -0.57421875, + "logits/rejected": -0.6484375, + "logps/chosen": -2448.0, + "logps/rejected": -2040.0, + "loss": 0.6592, + "rewards/accuracies": 0.5400000810623169, + "rewards/chosen": 0.515625, + "rewards/margins": 0.177734375, + "rewards/rejected": 0.333984375, + "step": 540 + }, + { + "epoch": 0.27, + "grad_norm": 8.826948047502222, + "learning_rate": 4.57360448730088e-07, + "logits/chosen": -0.6171875, + "logits/rejected": -0.6953125, + "logps/chosen": -2496.0, + "logps/rejected": -2112.0, + "loss": 0.6483, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 0.61328125, + "rewards/margins": 0.08837890625, + "rewards/rejected": 0.52734375, + "step": 550 + }, + { + "epoch": 0.27, + "grad_norm": 7.904587345979932, + "learning_rate": 4.549380062142627e-07, + "logits/chosen": -0.66796875, + "logits/rejected": -0.65625, + "logps/chosen": -1960.0, + "logps/rejected": -1928.0, + "loss": 0.6557, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.515625, + "rewards/margins": 0.052001953125, + "rewards/rejected": 0.46484375, + "step": 560 + }, + { + "epoch": 0.28, + "grad_norm": 7.802080225823436, + "learning_rate": 4.524554306847479e-07, + "logits/chosen": -0.6171875, + "logits/rejected": -0.6171875, + "logps/chosen": -2144.0, + "logps/rejected": -2096.0, + "loss": 0.652, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.5625, + "rewards/margins": 0.062255859375, + "rewards/rejected": 0.498046875, + "step": 570 + }, + { + "epoch": 0.28, + "grad_norm": 10.484471048264675, + "learning_rate": 4.499134505801141e-07, + "logits/chosen": -0.52734375, + "logits/rejected": -0.51953125, + "logps/chosen": -2352.0, + "logps/rejected": -2336.0, + "loss": 0.6459, + "rewards/accuracies": 0.42000001668930054, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.018798828125, + "rewards/rejected": 0.5703125, + "step": 580 + }, + { + "epoch": 0.29, + "grad_norm": 11.582326622430703, + "learning_rate": 4.4731281176945244e-07, + "logits/chosen": -0.5234375, + "logits/rejected": -0.640625, + "logps/chosen": -2768.0, + "logps/rejected": -2208.0, + "loss": 0.6501, + "rewards/accuracies": 0.5400000810623169, + "rewards/chosen": 0.72265625, + "rewards/margins": 0.1611328125, + "rewards/rejected": 0.55859375, + "step": 590 + }, + { + "epoch": 0.29, + "grad_norm": 8.880977624721396, + "learning_rate": 4.4465427733352124e-07, + "logits/chosen": -0.5078125, + "logits/rejected": -0.55859375, + "logps/chosen": -2320.0, + "logps/rejected": -2080.0, + "loss": 0.686, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.6640625, + "rewards/margins": 0.251953125, + "rewards/rejected": 0.4140625, + "step": 600 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -0.66015625, + "eval_logits/rejected": -0.71875, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2024.0, + "eval_loss": 0.6446093916893005, + "eval_rewards/accuracies": 0.5714285969734192, + "eval_rewards/chosen": 0.578125, + "eval_rewards/margins": 0.1591796875, + "eval_rewards/rejected": 0.41796875, + "eval_runtime": 89.7151, + "eval_samples_per_second": 22.293, + "eval_steps_per_second": 0.468, + "step": 600 + }, + { + "epoch": 0.3, + "grad_norm": 11.188626521589404, + "learning_rate": 4.4193862734084277e-07, + "logits/chosen": -0.6796875, + "logits/rejected": -0.76171875, + "logps/chosen": -2192.0, + "logps/rejected": -2024.0, + "loss": 0.6552, + "rewards/accuracies": 0.5400000810623169, + "rewards/chosen": 0.5703125, + "rewards/margins": 0.1181640625, + "rewards/rejected": 0.453125, + "step": 610 + }, + { + "epoch": 0.3, + "grad_norm": 6.697586616076624, + "learning_rate": 4.391666586188145e-07, + "logits/chosen": -0.640625, + "logits/rejected": -0.703125, + "logps/chosen": -2352.0, + "logps/rejected": -2128.0, + "loss": 0.6495, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.1611328125, + "rewards/rejected": 0.39453125, + "step": 620 + }, + { + "epoch": 0.31, + "grad_norm": 11.772870086073146, + "learning_rate": 4.363391845199045e-07, + "logits/chosen": -0.56640625, + "logits/rejected": -0.59765625, + "logps/chosen": -2432.0, + "logps/rejected": -2272.0, + "loss": 0.625, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 0.58203125, + "rewards/margins": 0.271484375, + "rewards/rejected": 0.3125, + "step": 630 + }, + { + "epoch": 0.31, + "grad_norm": 8.98000540892343, + "learning_rate": 4.3345703468299634e-07, + "logits/chosen": -0.52734375, + "logits/rejected": -0.58203125, + "logps/chosen": -2608.0, + "logps/rejected": -2304.0, + "loss": 0.6203, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.1865234375, + "rewards/rejected": 0.3828125, + "step": 640 + }, + { + "epoch": 0.32, + "grad_norm": 10.179524289387247, + "learning_rate": 4.3052105478995635e-07, + "logits/chosen": -0.53125, + "logits/rejected": -0.71875, + "logps/chosen": -2448.0, + "logps/rejected": -1608.0, + "loss": 0.6391, + "rewards/accuracies": 0.6800000667572021, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.302734375, + "rewards/rejected": 0.25390625, + "step": 650 + }, + { + "epoch": 0.32, + "grad_norm": 12.855522500267961, + "learning_rate": 4.275321063174936e-07, + "logits/chosen": -0.6015625, + "logits/rejected": -0.640625, + "logps/chosen": -1656.0, + "logps/rejected": -1456.0, + "loss": 0.643, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.09619140625, + "rewards/rejected": 0.298828125, + "step": 660 + }, + { + "epoch": 0.33, + "grad_norm": 7.0575963436949465, + "learning_rate": 4.24491066284384e-07, + "logits/chosen": -0.73046875, + "logits/rejected": -0.734375, + "logps/chosen": -1664.0, + "logps/rejected": -1544.0, + "loss": 0.6358, + "rewards/accuracies": 0.5199999809265137, + "rewards/chosen": 0.373046875, + "rewards/margins": 0.1484375, + "rewards/rejected": 0.224609375, + "step": 670 + }, + { + "epoch": 0.33, + "grad_norm": 8.261526871746847, + "learning_rate": 4.2139882699413613e-07, + "logits/chosen": -0.54296875, + "logits/rejected": -0.5625, + "logps/chosen": -2240.0, + "logps/rejected": -2144.0, + "loss": 0.6533, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.6640625, + "rewards/margins": 0.169921875, + "rewards/rejected": 0.494140625, + "step": 680 + }, + { + "epoch": 0.34, + "grad_norm": 9.857235722928268, + "learning_rate": 4.1825629577317024e-07, + "logits/chosen": -0.55859375, + "logits/rejected": -0.63671875, + "logps/chosen": -2288.0, + "logps/rejected": -1960.0, + "loss": 0.6869, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.5859375, + "rewards/margins": 0.1455078125, + "rewards/rejected": 0.439453125, + "step": 690 + }, + { + "epoch": 0.34, + "grad_norm": 9.152797505715435, + "learning_rate": 4.1506439470459056e-07, + "logits/chosen": -0.51171875, + "logits/rejected": -0.609375, + "logps/chosen": -2496.0, + "logps/rejected": -2208.0, + "loss": 0.6459, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 0.58203125, + "rewards/margins": 0.166015625, + "rewards/rejected": 0.416015625, + "step": 700 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -0.62890625, + "eval_logits/rejected": -0.6875, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2032.0, + "eval_loss": 0.6448671817779541, + "eval_rewards/accuracies": 0.601190447807312, + "eval_rewards/chosen": 0.55078125, + "eval_rewards/margins": 0.1884765625, + "eval_rewards/rejected": 0.36328125, + "eval_runtime": 90.547, + "eval_samples_per_second": 22.088, + "eval_steps_per_second": 0.464, + "step": 700 + }, + { + "epoch": 0.35, + "grad_norm": 9.410823784289315, + "learning_rate": 4.1182406035762684e-07, + "logits/chosen": -0.494140625, + "logits/rejected": -0.5234375, + "logps/chosen": -2288.0, + "logps/rejected": -2016.0, + "loss": 0.6429, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 0.44140625, + "rewards/margins": 0.1240234375, + "rewards/rejected": 0.31640625, + "step": 710 + }, + { + "epoch": 0.35, + "grad_norm": 9.75600415982287, + "learning_rate": 4.085362435128262e-07, + "logits/chosen": -0.5390625, + "logits/rejected": -0.68359375, + "logps/chosen": -2768.0, + "logps/rejected": -2224.0, + "loss": 0.6704, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 0.6328125, + "rewards/margins": 0.2470703125, + "rewards/rejected": 0.38671875, + "step": 720 + }, + { + "epoch": 0.36, + "grad_norm": 7.3325000524472, + "learning_rate": 4.0520190888307413e-07, + "logits/chosen": -0.61328125, + "logits/rejected": -0.66796875, + "logps/chosen": -2544.0, + "logps/rejected": -2304.0, + "loss": 0.647, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.232421875, + "rewards/rejected": 0.40625, + "step": 730 + }, + { + "epoch": 0.36, + "grad_norm": 10.182221180599024, + "learning_rate": 4.0182203483052825e-07, + "logits/chosen": -0.546875, + "logits/rejected": -0.68359375, + "logps/chosen": -2864.0, + "logps/rejected": -2288.0, + "loss": 0.6299, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 0.7265625, + "rewards/margins": 0.248046875, + "rewards/rejected": 0.4765625, + "step": 740 + }, + { + "epoch": 0.37, + "grad_norm": 7.856122064768362, + "learning_rate": 3.983976130795467e-07, + "logits/chosen": -0.51953125, + "logits/rejected": -0.61328125, + "logps/chosen": -2576.0, + "logps/rejected": -2128.0, + "loss": 0.6215, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.71484375, + "rewards/margins": 0.275390625, + "rewards/rejected": 0.439453125, + "step": 750 + }, + { + "epoch": 0.37, + "grad_norm": 8.363791551838782, + "learning_rate": 3.949296484256959e-07, + "logits/chosen": -0.5859375, + "logits/rejected": -0.64453125, + "logps/chosen": -2128.0, + "logps/rejected": -2008.0, + "loss": 0.6718, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 0.490234375, + "rewards/margins": 0.0625, + "rewards/rejected": 0.427734375, + "step": 760 + }, + { + "epoch": 0.38, + "grad_norm": 9.779529809569974, + "learning_rate": 3.9141915844092285e-07, + "logits/chosen": -0.546875, + "logits/rejected": -0.6796875, + "logps/chosen": -2208.0, + "logps/rejected": -1912.0, + "loss": 0.6547, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 0.55078125, + "rewards/margins": 0.23046875, + "rewards/rejected": 0.3203125, + "step": 770 + }, + { + "epoch": 0.38, + "grad_norm": 8.698999015594477, + "learning_rate": 3.8786717317497875e-07, + "logits/chosen": -0.4921875, + "logits/rejected": -0.609375, + "logps/chosen": -2432.0, + "logps/rejected": -2096.0, + "loss": 0.6326, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.6484375, + "rewards/margins": 0.353515625, + "rewards/rejected": 0.296875, + "step": 780 + }, + { + "epoch": 0.39, + "grad_norm": 8.385397043034576, + "learning_rate": 3.842747348531813e-07, + "logits/chosen": -0.53515625, + "logits/rejected": -0.58203125, + "logps/chosen": -2192.0, + "logps/rejected": -1880.0, + "loss": 0.6337, + "rewards/accuracies": 0.64000004529953, + "rewards/chosen": 0.578125, + "rewards/margins": 0.23828125, + "rewards/rejected": 0.337890625, + "step": 790 + }, + { + "epoch": 0.39, + "grad_norm": 8.23475490511833, + "learning_rate": 3.806428975706042e-07, + "logits/chosen": -0.62890625, + "logits/rejected": -0.65625, + "logps/chosen": -2352.0, + "logps/rejected": -2112.0, + "loss": 0.6458, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.2119140625, + "rewards/rejected": 0.41796875, + "step": 800 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -0.640625, + "eval_logits/rejected": -0.6953125, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2024.0, + "eval_loss": 0.6421015858650208, + "eval_rewards/accuracies": 0.5773809552192688, + "eval_rewards/chosen": 0.55859375, + "eval_rewards/margins": 0.1708984375, + "eval_rewards/rejected": 0.38671875, + "eval_runtime": 86.7441, + "eval_samples_per_second": 23.056, + "eval_steps_per_second": 0.484, + "step": 800 + }, + { + "epoch": 0.4, + "grad_norm": 9.096444731037868, + "learning_rate": 3.769727269827843e-07, + "logits/chosen": -0.51171875, + "logits/rejected": -0.62109375, + "logps/chosen": -1992.0, + "logps/rejected": -1672.0, + "loss": 0.6547, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.337890625, + "rewards/margins": 0.0634765625, + "rewards/rejected": 0.275390625, + "step": 810 + }, + { + "epoch": 0.4, + "grad_norm": 9.063205334817118, + "learning_rate": 3.7326529999303633e-07, + "logits/chosen": -0.55859375, + "logits/rejected": -0.6484375, + "logps/chosen": -2608.0, + "logps/rejected": -2128.0, + "loss": 0.6512, + "rewards/accuracies": 0.5400000810623169, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.1396484375, + "rewards/rejected": 0.42578125, + "step": 820 + }, + { + "epoch": 0.41, + "grad_norm": 8.96580135461996, + "learning_rate": 3.6952170443646737e-07, + "logits/chosen": -0.51171875, + "logits/rejected": -0.63671875, + "logps/chosen": -2752.0, + "logps/rejected": -2064.0, + "loss": 0.6351, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.73828125, + "rewards/margins": 0.248046875, + "rewards/rejected": 0.4921875, + "step": 830 + }, + { + "epoch": 0.41, + "grad_norm": 10.61500715901887, + "learning_rate": 3.6574303876078366e-07, + "logits/chosen": -0.5234375, + "logits/rejected": -0.6640625, + "logps/chosen": -2384.0, + "logps/rejected": -1816.0, + "loss": 0.6429, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.734375, + "rewards/margins": 0.30859375, + "rewards/rejected": 0.42578125, + "step": 840 + }, + { + "epoch": 0.42, + "grad_norm": 8.368580291250769, + "learning_rate": 3.619304117039835e-07, + "logits/chosen": -0.52734375, + "logits/rejected": -0.546875, + "logps/chosen": -2352.0, + "logps/rejected": -2240.0, + "loss": 0.6492, + "rewards/accuracies": 0.440000057220459, + "rewards/chosen": 0.75, + "rewards/margins": 0.12353515625, + "rewards/rejected": 0.625, + "step": 850 + }, + { + "epoch": 0.42, + "grad_norm": 6.994507159815261, + "learning_rate": 3.5808494196903117e-07, + "logits/chosen": -0.55078125, + "logits/rejected": -0.62890625, + "logps/chosen": -2608.0, + "logps/rejected": -2128.0, + "loss": 0.6169, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 0.87109375, + "rewards/margins": 0.400390625, + "rewards/rejected": 0.47265625, + "step": 860 + }, + { + "epoch": 0.43, + "grad_norm": 8.69645008304575, + "learning_rate": 3.542077578956057e-07, + "logits/chosen": -0.51953125, + "logits/rejected": -0.578125, + "logps/chosen": -2416.0, + "logps/rejected": -2192.0, + "loss": 0.6549, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.7421875, + "rewards/margins": 0.21484375, + "rewards/rejected": 0.52734375, + "step": 870 + }, + { + "epoch": 0.43, + "grad_norm": 6.41242543011292, + "learning_rate": 3.5029999712902387e-07, + "logits/chosen": -0.45703125, + "logits/rejected": -0.53125, + "logps/chosen": -2608.0, + "logps/rejected": -2352.0, + "loss": 0.6401, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.81640625, + "rewards/margins": 0.15234375, + "rewards/rejected": 0.6640625, + "step": 880 + }, + { + "epoch": 0.44, + "grad_norm": 7.750038189045204, + "learning_rate": 3.463628062864312e-07, + "logits/chosen": -0.470703125, + "logits/rejected": -0.58984375, + "logps/chosen": -2688.0, + "logps/rejected": -2128.0, + "loss": 0.622, + "rewards/accuracies": 0.6599999666213989, + "rewards/chosen": 0.8828125, + "rewards/margins": 0.28515625, + "rewards/rejected": 0.59765625, + "step": 890 + }, + { + "epoch": 0.44, + "grad_norm": 7.569717070191467, + "learning_rate": 3.4239734062036067e-07, + "logits/chosen": -0.5, + "logits/rejected": -0.58203125, + "logps/chosen": -2512.0, + "logps/rejected": -2224.0, + "loss": 0.6451, + "rewards/accuracies": 0.4599999785423279, + "rewards/chosen": 0.625, + "rewards/margins": 0.06591796875, + "rewards/rejected": 0.55859375, + "step": 900 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -0.61328125, + "eval_logits/rejected": -0.671875, + "eval_logps/chosen": -2304.0, + "eval_logps/rejected": -2016.0, + "eval_loss": 0.6398203372955322, + "eval_rewards/accuracies": 0.5684523582458496, + "eval_rewards/chosen": 0.7109375, + "eval_rewards/margins": 0.20703125, + "eval_rewards/rejected": 0.50390625, + "eval_runtime": 86.5756, + "eval_samples_per_second": 23.101, + "eval_steps_per_second": 0.485, + "step": 900 + }, + { + "epoch": 0.45, + "grad_norm": 8.632582600123046, + "learning_rate": 3.3840476367975874e-07, + "logits/chosen": -0.515625, + "logits/rejected": -0.62890625, + "logps/chosen": -2432.0, + "logps/rejected": -1968.0, + "loss": 0.6196, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.78515625, + "rewards/margins": 0.328125, + "rewards/rejected": 0.45703125, + "step": 910 + }, + { + "epoch": 0.45, + "grad_norm": 8.506506212272358, + "learning_rate": 3.343862469685755e-07, + "logits/chosen": -0.6015625, + "logits/rejected": -0.68359375, + "logps/chosen": -1960.0, + "logps/rejected": -1776.0, + "loss": 0.6652, + "rewards/accuracies": 0.40000003576278687, + "rewards/chosen": 0.53125, + "rewards/margins": 0.045166015625, + "rewards/rejected": 0.486328125, + "step": 920 + }, + { + "epoch": 0.46, + "grad_norm": 11.75407958457082, + "learning_rate": 3.3034296960202195e-07, + "logits/chosen": -0.45703125, + "logits/rejected": -0.5703125, + "logps/chosen": -2656.0, + "logps/rejected": -2192.0, + "loss": 0.6409, + "rewards/accuracies": 0.5199999809265137, + "rewards/chosen": 0.7734375, + "rewards/margins": 0.232421875, + "rewards/rejected": 0.54296875, + "step": 930 + }, + { + "epoch": 0.46, + "grad_norm": 8.239692706862764, + "learning_rate": 3.2627611796059283e-07, + "logits/chosen": -0.53515625, + "logits/rejected": -0.5625, + "logps/chosen": -2736.0, + "logps/rejected": -2400.0, + "loss": 0.6449, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.91796875, + "rewards/margins": 0.2333984375, + "rewards/rejected": 0.68359375, + "step": 940 + }, + { + "epoch": 0.47, + "grad_norm": 8.160298452954589, + "learning_rate": 3.221868853419587e-07, + "logits/chosen": -0.46875, + "logits/rejected": -0.55078125, + "logps/chosen": -2768.0, + "logps/rejected": -2272.0, + "loss": 0.6157, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.98046875, + "rewards/margins": 0.302734375, + "rewards/rejected": 0.67578125, + "step": 950 + }, + { + "epoch": 0.47, + "grad_norm": 9.590783642097492, + "learning_rate": 3.1807647161082797e-07, + "logits/chosen": -0.47265625, + "logits/rejected": -0.5625, + "logps/chosen": -2656.0, + "logps/rejected": -2336.0, + "loss": 0.6511, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 0.859375, + "rewards/margins": 0.16796875, + "rewards/rejected": 0.69140625, + "step": 960 + }, + { + "epoch": 0.48, + "grad_norm": 7.99282237541914, + "learning_rate": 3.139460828468815e-07, + "logits/chosen": -0.44140625, + "logits/rejected": -0.494140625, + "logps/chosen": -1976.0, + "logps/rejected": -1776.0, + "loss": 0.6747, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.6328125, + "rewards/margins": 0.166015625, + "rewards/rejected": 0.466796875, + "step": 970 + }, + { + "epoch": 0.48, + "grad_norm": 12.319935997541707, + "learning_rate": 3.097969309908847e-07, + "logits/chosen": -0.60546875, + "logits/rejected": -0.5390625, + "logps/chosen": -1728.0, + "logps/rejected": -2024.0, + "loss": 0.6457, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5234375, + "rewards/margins": 0.03857421875, + "rewards/rejected": 0.486328125, + "step": 980 + }, + { + "epoch": 0.49, + "grad_norm": 9.488759901702995, + "learning_rate": 3.056302334890786e-07, + "logits/chosen": -0.53515625, + "logits/rejected": -0.62890625, + "logps/chosen": -2400.0, + "logps/rejected": -2096.0, + "loss": 0.6284, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.83203125, + "rewards/margins": 0.1953125, + "rewards/rejected": 0.63671875, + "step": 990 + }, + { + "epoch": 0.49, + "grad_norm": 7.899477826304579, + "learning_rate": 3.01447212935957e-07, + "logits/chosen": -0.5234375, + "logits/rejected": -0.609375, + "logps/chosen": -2432.0, + "logps/rejected": -2064.0, + "loss": 0.6213, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.8203125, + "rewards/margins": 0.27734375, + "rewards/rejected": 0.54296875, + "step": 1000 + }, + { + "epoch": 0.49, + "eval_logits/chosen": -0.6015625, + "eval_logits/rejected": -0.66015625, + "eval_logps/chosen": -2304.0, + "eval_logps/rejected": -2008.0, + "eval_loss": 0.6406640410423279, + "eval_rewards/accuracies": 0.5714285969734192, + "eval_rewards/chosen": 0.7734375, + "eval_rewards/margins": 0.201171875, + "eval_rewards/rejected": 0.57421875, + "eval_runtime": 86.4608, + "eval_samples_per_second": 23.132, + "eval_steps_per_second": 0.486, + "step": 1000 + }, + { + "epoch": 0.5, + "grad_norm": 8.55209914949281, + "learning_rate": 2.9724909671553134e-07, + "logits/chosen": -0.56640625, + "logits/rejected": -0.6015625, + "logps/chosen": -2040.0, + "logps/rejected": -1952.0, + "loss": 0.6205, + "rewards/accuracies": 0.5399999618530273, + "rewards/chosen": 0.66796875, + "rewards/margins": 0.11328125, + "rewards/rejected": 0.5546875, + "step": 1010 + }, + { + "epoch": 0.5, + "grad_norm": 7.990632543907024, + "learning_rate": 2.930371166411915e-07, + "logits/chosen": -0.458984375, + "logits/rejected": -0.53515625, + "logps/chosen": -2912.0, + "logps/rejected": -2672.0, + "loss": 0.6432, + "rewards/accuracies": 0.6200000643730164, + "rewards/chosen": 0.94921875, + "rewards/margins": 0.173828125, + "rewards/rejected": 0.77734375, + "step": 1020 + }, + { + "epoch": 0.51, + "grad_norm": 8.428930236764653, + "learning_rate": 2.888125085942664e-07, + "logits/chosen": -0.52734375, + "logits/rejected": -0.578125, + "logps/chosen": -1992.0, + "logps/rejected": -1808.0, + "loss": 0.6492, + "rewards/accuracies": 0.6200000643730164, + "rewards/chosen": 0.62109375, + "rewards/margins": 0.1728515625, + "rewards/rejected": 0.447265625, + "step": 1030 + }, + { + "epoch": 0.51, + "grad_norm": 9.162488652610268, + "learning_rate": 2.845765121613912e-07, + "logits/chosen": -0.54296875, + "logits/rejected": -0.62109375, + "logps/chosen": -2368.0, + "logps/rejected": -2048.0, + "loss": 0.6406, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.7265625, + "rewards/margins": 0.1865234375, + "rewards/rejected": 0.54296875, + "step": 1040 + }, + { + "epoch": 0.52, + "grad_norm": 7.430844898047478, + "learning_rate": 2.803303702707869e-07, + "logits/chosen": -0.56640625, + "logits/rejected": -0.703125, + "logps/chosen": -2240.0, + "logps/rejected": -1752.0, + "loss": 0.6623, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 0.578125, + "rewards/margins": 0.224609375, + "rewards/rejected": 0.3515625, + "step": 1050 + }, + { + "epoch": 0.52, + "grad_norm": 10.51154519002261, + "learning_rate": 2.760753288275598e-07, + "logits/chosen": -0.58203125, + "logits/rejected": -0.640625, + "logps/chosen": -2544.0, + "logps/rejected": -2320.0, + "loss": 0.6646, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 0.74609375, + "rewards/margins": 0.259765625, + "rewards/rejected": 0.484375, + "step": 1060 + }, + { + "epoch": 0.53, + "grad_norm": 7.047353993767992, + "learning_rate": 2.718126363481276e-07, + "logits/chosen": -0.59375, + "logits/rejected": -0.78125, + "logps/chosen": -2720.0, + "logps/rejected": -1832.0, + "loss": 0.6346, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.7109375, + "rewards/margins": 0.341796875, + "rewards/rejected": 0.37109375, + "step": 1070 + }, + { + "epoch": 0.53, + "grad_norm": 8.420763714981168, + "learning_rate": 2.675435435938788e-07, + "logits/chosen": -0.59765625, + "logits/rejected": -0.56640625, + "logps/chosen": -1784.0, + "logps/rejected": -1880.0, + "loss": 0.6347, + "rewards/accuracies": 0.47999995946884155, + "rewards/chosen": 0.39453125, + "rewards/margins": 0.039306640625, + "rewards/rejected": 0.353515625, + "step": 1080 + }, + { + "epoch": 0.53, + "grad_norm": 8.77849576592234, + "learning_rate": 2.63269303204174e-07, + "logits/chosen": -0.48046875, + "logits/rejected": -0.5546875, + "logps/chosen": -2656.0, + "logps/rejected": -2352.0, + "loss": 0.6426, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6640625, + "rewards/margins": 0.275390625, + "rewards/rejected": 0.38671875, + "step": 1090 + }, + { + "epoch": 0.54, + "grad_norm": 8.92354118074767, + "learning_rate": 2.5899116932879534e-07, + "logits/chosen": -0.4765625, + "logits/rejected": -0.5703125, + "logps/chosen": -2496.0, + "logps/rejected": -2144.0, + "loss": 0.6313, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 0.58203125, + "rewards/margins": 0.2353515625, + "rewards/rejected": 0.345703125, + "step": 1100 + }, + { + "epoch": 0.54, + "eval_logits/chosen": -0.609375, + "eval_logits/rejected": -0.66796875, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2032.0, + "eval_loss": 0.638671875, + "eval_rewards/accuracies": 0.5892857313156128, + "eval_rewards/chosen": 0.5390625, + "eval_rewards/margins": 0.1806640625, + "eval_rewards/rejected": 0.35546875, + "eval_runtime": 86.566, + "eval_samples_per_second": 23.104, + "eval_steps_per_second": 0.485, + "step": 1100 + }, + { + "epoch": 0.54, + "grad_norm": 8.680098424161624, + "learning_rate": 2.5471039725995345e-07, + "logits/chosen": -0.5078125, + "logits/rejected": -0.5234375, + "logps/chosen": -2224.0, + "logps/rejected": -2096.0, + "loss": 0.6539, + "rewards/accuracies": 0.5199999809265137, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.259765625, + "rewards/rejected": 0.330078125, + "step": 1110 + }, + { + "epoch": 0.55, + "grad_norm": 12.234325062863915, + "learning_rate": 2.504282430639594e-07, + "logits/chosen": -0.466796875, + "logits/rejected": -0.59765625, + "logps/chosen": -2560.0, + "logps/rejected": -2064.0, + "loss": 0.6367, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.56640625, + "rewards/margins": 0.197265625, + "rewards/rejected": 0.3671875, + "step": 1120 + }, + { + "epoch": 0.55, + "grad_norm": 9.71309302276875, + "learning_rate": 2.4614596321266836e-07, + "logits/chosen": -0.60546875, + "logits/rejected": -0.625, + "logps/chosen": -2352.0, + "logps/rejected": -2272.0, + "loss": 0.632, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 0.45703125, + "rewards/margins": 0.12451171875, + "rewards/rejected": 0.33203125, + "step": 1130 + }, + { + "epoch": 0.56, + "grad_norm": 9.646759761306011, + "learning_rate": 2.418648142148056e-07, + "logits/chosen": -0.50390625, + "logits/rejected": -0.59375, + "logps/chosen": -2784.0, + "logps/rejected": -2192.0, + "loss": 0.6486, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.6640625, + "rewards/margins": 0.21875, + "rewards/rejected": 0.443359375, + "step": 1140 + }, + { + "epoch": 0.56, + "grad_norm": 8.518198214709173, + "learning_rate": 2.375860522472805e-07, + "logits/chosen": -0.578125, + "logits/rejected": -0.68359375, + "logps/chosen": -2064.0, + "logps/rejected": -1624.0, + "loss": 0.6414, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.435546875, + "rewards/margins": 0.19140625, + "rewards/rejected": 0.2451171875, + "step": 1150 + }, + { + "epoch": 0.57, + "grad_norm": 9.732180854785232, + "learning_rate": 2.3331093278659906e-07, + "logits/chosen": -0.5625, + "logits/rejected": -0.5859375, + "logps/chosen": -1992.0, + "logps/rejected": -1888.0, + "loss": 0.6358, + "rewards/accuracies": 0.4599999785423279, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.212890625, + "rewards/rejected": 0.322265625, + "step": 1160 + }, + { + "epoch": 0.57, + "grad_norm": 8.25240239846351, + "learning_rate": 2.2904071024048089e-07, + "logits/chosen": -0.5546875, + "logits/rejected": -0.515625, + "logps/chosen": -2016.0, + "logps/rejected": -2016.0, + "loss": 0.6236, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 0.5625, + "rewards/margins": 0.1025390625, + "rewards/rejected": 0.4609375, + "step": 1170 + }, + { + "epoch": 0.58, + "grad_norm": 9.230235569126055, + "learning_rate": 2.247766375797906e-07, + "logits/chosen": -0.5546875, + "logits/rejected": -0.625, + "logps/chosen": -2224.0, + "logps/rejected": -1864.0, + "loss": 0.6549, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.62109375, + "rewards/margins": 0.205078125, + "rewards/rejected": 0.416015625, + "step": 1180 + }, + { + "epoch": 0.58, + "grad_norm": 10.347179580129167, + "learning_rate": 2.2051996597089026e-07, + "logits/chosen": -0.49609375, + "logits/rejected": -0.55078125, + "logps/chosen": -2096.0, + "logps/rejected": -1944.0, + "loss": 0.6314, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.421875, + "rewards/margins": 0.0191650390625, + "rewards/rejected": 0.40234375, + "step": 1190 + }, + { + "epoch": 0.59, + "grad_norm": 7.338817367824801, + "learning_rate": 2.1627194440852142e-07, + "logits/chosen": -0.478515625, + "logits/rejected": -0.59765625, + "logps/chosen": -2592.0, + "logps/rejected": -2064.0, + "loss": 0.6298, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.78515625, + "rewards/margins": 0.265625, + "rewards/rejected": 0.51953125, + "step": 1200 + }, + { + "epoch": 0.59, + "eval_logits/chosen": -0.59765625, + "eval_logits/rejected": -0.65234375, + "eval_logps/chosen": -2304.0, + "eval_logps/rejected": -2016.0, + "eval_loss": 0.6379843950271606, + "eval_rewards/accuracies": 0.6041666865348816, + "eval_rewards/chosen": 0.6953125, + "eval_rewards/margins": 0.203125, + "eval_rewards/rejected": 0.4921875, + "eval_runtime": 86.5496, + "eval_samples_per_second": 23.108, + "eval_steps_per_second": 0.485, + "step": 1200 + }, + { + "epoch": 0.59, + "grad_norm": 11.049536436218999, + "learning_rate": 2.120338193493248e-07, + "logits/chosen": -0.53515625, + "logits/rejected": -0.59765625, + "logps/chosen": -2624.0, + "logps/rejected": -2144.0, + "loss": 0.6477, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.7421875, + "rewards/margins": 0.2890625, + "rewards/rejected": 0.453125, + "step": 1210 + }, + { + "epoch": 0.6, + "grad_norm": 6.8225110204324615, + "learning_rate": 2.0780683434610413e-07, + "logits/chosen": -0.52734375, + "logits/rejected": -0.59765625, + "logps/chosen": -2416.0, + "logps/rejected": -2240.0, + "loss": 0.6609, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 0.734375, + "rewards/margins": 0.2099609375, + "rewards/rejected": 0.52734375, + "step": 1220 + }, + { + "epoch": 0.6, + "grad_norm": 10.18256822065958, + "learning_rate": 2.0359222968294202e-07, + "logits/chosen": -0.5703125, + "logits/rejected": -0.58203125, + "logps/chosen": -2192.0, + "logps/rejected": -2096.0, + "loss": 0.6622, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.5703125, + "rewards/margins": 0.208984375, + "rewards/rejected": 0.359375, + "step": 1230 + }, + { + "epoch": 0.61, + "grad_norm": 8.086963793402472, + "learning_rate": 1.993912420112756e-07, + "logits/chosen": -0.62109375, + "logits/rejected": -0.6015625, + "logps/chosen": -1992.0, + "logps/rejected": -2096.0, + "loss": 0.6603, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.09765625, + "rewards/rejected": 0.53125, + "step": 1240 + }, + { + "epoch": 0.61, + "grad_norm": 8.024306896782054, + "learning_rate": 1.9520510398703766e-07, + "logits/chosen": -0.51953125, + "logits/rejected": -0.5625, + "logps/chosen": -2512.0, + "logps/rejected": -2320.0, + "loss": 0.6632, + "rewards/accuracies": 0.5400000810623169, + "rewards/chosen": 0.70703125, + "rewards/margins": 0.2353515625, + "rewards/rejected": 0.47265625, + "step": 1250 + }, + { + "epoch": 0.62, + "grad_norm": 9.937034420935973, + "learning_rate": 1.9103504390896944e-07, + "logits/chosen": -0.49609375, + "logits/rejected": -0.5859375, + "logps/chosen": -2464.0, + "logps/rejected": -2240.0, + "loss": 0.6705, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.107421875, + "rewards/rejected": 0.482421875, + "step": 1260 + }, + { + "epoch": 0.62, + "grad_norm": 10.056595402865112, + "learning_rate": 1.8688228535821348e-07, + "logits/chosen": -0.53125, + "logits/rejected": -0.50390625, + "logps/chosen": -1936.0, + "logps/rejected": -2040.0, + "loss": 0.6226, + "rewards/accuracies": 0.440000057220459, + "rewards/chosen": 0.44921875, + "rewards/margins": 0.1337890625, + "rewards/rejected": 0.314453125, + "step": 1270 + }, + { + "epoch": 0.63, + "grad_norm": 8.971832197264595, + "learning_rate": 1.8274804683928913e-07, + "logits/chosen": -0.578125, + "logits/rejected": -0.55859375, + "logps/chosen": -2048.0, + "logps/rejected": -2160.0, + "loss": 0.6517, + "rewards/accuracies": 0.48000001907348633, + "rewards/chosen": 0.498046875, + "rewards/margins": 0.054443359375, + "rewards/rejected": 0.4453125, + "step": 1280 + }, + { + "epoch": 0.63, + "grad_norm": 9.440719762857112, + "learning_rate": 1.786335414225588e-07, + "logits/chosen": -0.5625, + "logits/rejected": -0.625, + "logps/chosen": -2096.0, + "logps/rejected": -1992.0, + "loss": 0.6552, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.400390625, + "rewards/margins": 0.19140625, + "rewards/rejected": 0.208984375, + "step": 1290 + }, + { + "epoch": 0.64, + "grad_norm": 12.661374748922718, + "learning_rate": 1.745399763882881e-07, + "logits/chosen": -0.5, + "logits/rejected": -0.57421875, + "logps/chosen": -2512.0, + "logps/rejected": -2112.0, + "loss": 0.6461, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5390625, + "rewards/margins": 0.19921875, + "rewards/rejected": 0.33984375, + "step": 1300 + }, + { + "epoch": 0.64, + "eval_logits/chosen": -0.63671875, + "eval_logits/rejected": -0.69140625, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2032.0, + "eval_loss": 0.6395859122276306, + "eval_rewards/accuracies": 0.586309552192688, + "eval_rewards/chosen": 0.55859375, + "eval_rewards/margins": 0.1962890625, + "eval_rewards/rejected": 0.361328125, + "eval_runtime": 86.6403, + "eval_samples_per_second": 23.084, + "eval_steps_per_second": 0.485, + "step": 1300 + }, + { + "epoch": 0.64, + "grad_norm": 8.541591527739216, + "learning_rate": 1.704685528724046e-07, + "logits/chosen": -0.56640625, + "logits/rejected": -0.625, + "logps/chosen": -2496.0, + "logps/rejected": -2304.0, + "loss": 0.6451, + "rewards/accuracies": 0.48000001907348633, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.0703125, + "rewards/rejected": 0.55859375, + "step": 1310 + }, + { + "epoch": 0.65, + "grad_norm": 9.240636114996493, + "learning_rate": 1.664204655140607e-07, + "logits/chosen": -0.5625, + "logits/rejected": -0.61328125, + "logps/chosen": -2272.0, + "logps/rejected": -1944.0, + "loss": 0.6175, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 0.671875, + "rewards/margins": 0.298828125, + "rewards/rejected": 0.375, + "step": 1320 + }, + { + "epoch": 0.65, + "grad_norm": 9.2814996839246, + "learning_rate": 1.6239690210510166e-07, + "logits/chosen": -0.58984375, + "logits/rejected": -0.65625, + "logps/chosen": -2608.0, + "logps/rejected": -2368.0, + "loss": 0.6566, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 0.59765625, + "rewards/margins": 0.07421875, + "rewards/rejected": 0.5234375, + "step": 1330 + }, + { + "epoch": 0.66, + "grad_norm": 7.5279255603123, + "learning_rate": 1.5839904324154273e-07, + "logits/chosen": -0.34765625, + "logits/rejected": -0.455078125, + "logps/chosen": -2736.0, + "logps/rejected": -2256.0, + "loss": 0.6349, + "rewards/accuracies": 0.5000000596046448, + "rewards/chosen": 0.71875, + "rewards/margins": 0.2255859375, + "rewards/rejected": 0.49609375, + "step": 1340 + }, + { + "epoch": 0.66, + "grad_norm": 11.057065560114015, + "learning_rate": 1.544280619771588e-07, + "logits/chosen": -0.44140625, + "logits/rejected": -0.515625, + "logps/chosen": -2448.0, + "logps/rejected": -2160.0, + "loss": 0.6424, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 0.7109375, + "rewards/margins": 0.2080078125, + "rewards/rejected": 0.50390625, + "step": 1350 + }, + { + "epoch": 0.67, + "grad_norm": 9.245099893740868, + "learning_rate": 1.5048512347928564e-07, + "logits/chosen": -0.474609375, + "logits/rejected": -0.578125, + "logps/chosen": -2800.0, + "logps/rejected": -2480.0, + "loss": 0.6604, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.74609375, + "rewards/margins": 0.2138671875, + "rewards/rejected": 0.53125, + "step": 1360 + }, + { + "epoch": 0.67, + "grad_norm": 9.054284348259795, + "learning_rate": 1.4657138468693648e-07, + "logits/chosen": -0.57421875, + "logits/rejected": -0.671875, + "logps/chosen": -2224.0, + "logps/rejected": -1832.0, + "loss": 0.6223, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.59765625, + "rewards/margins": 0.33203125, + "rewards/rejected": 0.265625, + "step": 1370 + }, + { + "epoch": 0.68, + "grad_norm": 8.967356728455812, + "learning_rate": 1.426879939713322e-07, + "logits/chosen": -0.58984375, + "logits/rejected": -0.71484375, + "logps/chosen": -2464.0, + "logps/rejected": -1864.0, + "loss": 0.6278, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 0.76171875, + "rewards/margins": 0.3515625, + "rewards/rejected": 0.408203125, + "step": 1380 + }, + { + "epoch": 0.68, + "grad_norm": 8.462317796783486, + "learning_rate": 1.3883609079894532e-07, + "logits/chosen": -0.52734375, + "logits/rejected": -0.51171875, + "logps/chosen": -1848.0, + "logps/rejected": -1936.0, + "loss": 0.6453, + "rewards/accuracies": 0.5400000810623169, + "rewards/chosen": 0.5859375, + "rewards/margins": 0.07177734375, + "rewards/rejected": 0.515625, + "step": 1390 + }, + { + "epoch": 0.69, + "grad_norm": 12.275639958330698, + "learning_rate": 1.350168053971577e-07, + "logits/chosen": -0.51953125, + "logits/rejected": -0.69140625, + "logps/chosen": -2528.0, + "logps/rejected": -1856.0, + "loss": 0.6258, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 0.75, + "rewards/margins": 0.3671875, + "rewards/rejected": 0.3828125, + "step": 1400 + }, + { + "epoch": 0.69, + "eval_logits/chosen": -0.6171875, + "eval_logits/rejected": -0.67578125, + "eval_logps/chosen": -2304.0, + "eval_logps/rejected": -2016.0, + "eval_loss": 0.6359687447547913, + "eval_rewards/accuracies": 0.5922619104385376, + "eval_rewards/chosen": 0.69140625, + "eval_rewards/margins": 0.220703125, + "eval_rewards/rejected": 0.47265625, + "eval_runtime": 86.3381, + "eval_samples_per_second": 23.165, + "eval_steps_per_second": 0.486, + "step": 1400 + }, + { + "epoch": 0.69, + "grad_norm": 10.6189969938929, + "learning_rate": 1.312312584226284e-07, + "logits/chosen": -0.54296875, + "logits/rejected": -0.5859375, + "logps/chosen": -2368.0, + "logps/rejected": -2144.0, + "loss": 0.6219, + "rewards/accuracies": 0.48000001907348633, + "rewards/chosen": 0.77734375, + "rewards/margins": 0.271484375, + "rewards/rejected": 0.50390625, + "step": 1410 + }, + { + "epoch": 0.7, + "grad_norm": 7.349435818799782, + "learning_rate": 1.2748056063246994e-07, + "logits/chosen": -0.486328125, + "logits/rejected": -0.62890625, + "logps/chosen": -2416.0, + "logps/rejected": -1912.0, + "loss": 0.6426, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.73828125, + "rewards/margins": 0.25, + "rewards/rejected": 0.486328125, + "step": 1420 + }, + { + "epoch": 0.7, + "grad_norm": 9.740153583722698, + "learning_rate": 1.2376581255832966e-07, + "logits/chosen": -0.54296875, + "logits/rejected": -0.6640625, + "logps/chosen": -2688.0, + "logps/rejected": -2080.0, + "loss": 0.6293, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.78515625, + "rewards/margins": 0.294921875, + "rewards/rejected": 0.48828125, + "step": 1430 + }, + { + "epoch": 0.71, + "grad_norm": 8.753751751983152, + "learning_rate": 1.2008810418347093e-07, + "logits/chosen": -0.5546875, + "logits/rejected": -0.5859375, + "logps/chosen": -2080.0, + "logps/rejected": -1880.0, + "loss": 0.6566, + "rewards/accuracies": 0.64000004529953, + "rewards/chosen": 0.57421875, + "rewards/margins": 0.2021484375, + "rewards/rejected": 0.373046875, + "step": 1440 + }, + { + "epoch": 0.71, + "grad_norm": 8.57529838239571, + "learning_rate": 1.1644851462294956e-07, + "logits/chosen": -0.54296875, + "logits/rejected": -0.65234375, + "logps/chosen": -2032.0, + "logps/rejected": -1632.0, + "loss": 0.636, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5078125, + "rewards/margins": 0.1904296875, + "rewards/rejected": 0.318359375, + "step": 1450 + }, + { + "epoch": 0.72, + "grad_norm": 8.301288314698684, + "learning_rate": 1.128481118069799e-07, + "logits/chosen": -0.484375, + "logits/rejected": -0.59375, + "logps/chosen": -2768.0, + "logps/rejected": -2208.0, + "loss": 0.6506, + "rewards/accuracies": 0.7400001287460327, + "rewards/chosen": 0.796875, + "rewards/margins": 0.2421875, + "rewards/rejected": 0.5546875, + "step": 1460 + }, + { + "epoch": 0.72, + "grad_norm": 9.889148738488851, + "learning_rate": 1.0928795216758149e-07, + "logits/chosen": -0.55078125, + "logits/rejected": -0.62890625, + "logps/chosen": -2176.0, + "logps/rejected": -1832.0, + "loss": 0.6503, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.52734375, + "rewards/margins": 0.193359375, + "rewards/rejected": 0.3359375, + "step": 1470 + }, + { + "epoch": 0.73, + "grad_norm": 9.51045156043377, + "learning_rate": 1.0576908032860088e-07, + "logits/chosen": -0.62890625, + "logits/rejected": -0.640625, + "logps/chosen": -2208.0, + "logps/rejected": -2144.0, + "loss": 0.6682, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 0.66796875, + "rewards/margins": 0.24609375, + "rewards/rejected": 0.421875, + "step": 1480 + }, + { + "epoch": 0.73, + "grad_norm": 7.27263720622099, + "learning_rate": 1.0229252879919714e-07, + "logits/chosen": -0.5859375, + "logits/rejected": -0.60546875, + "logps/chosen": -1960.0, + "logps/rejected": -1784.0, + "loss": 0.6574, + "rewards/accuracies": 0.5399999618530273, + "rewards/chosen": 0.48828125, + "rewards/margins": 0.10107421875, + "rewards/rejected": 0.38671875, + "step": 1490 + }, + { + "epoch": 0.74, + "grad_norm": 9.236088100797234, + "learning_rate": 9.88593176708827e-08, + "logits/chosen": -0.4765625, + "logits/rejected": -0.59765625, + "logps/chosen": -2384.0, + "logps/rejected": -1888.0, + "loss": 0.6347, + "rewards/accuracies": 0.64000004529953, + "rewards/chosen": 0.7265625, + "rewards/margins": 0.44921875, + "rewards/rejected": 0.279296875, + "step": 1500 + }, + { + "epoch": 0.74, + "eval_logits/chosen": -0.609375, + "eval_logits/rejected": -0.6640625, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2024.0, + "eval_loss": 0.6374765634536743, + "eval_rewards/accuracies": 0.5892857313156128, + "eval_rewards/chosen": 0.625, + "eval_rewards/margins": 0.2099609375, + "eval_rewards/rejected": 0.4140625, + "eval_runtime": 86.4172, + "eval_samples_per_second": 23.144, + "eval_steps_per_second": 0.486, + "step": 1500 + }, + { + "epoch": 0.74, + "grad_norm": 8.84592951975432, + "learning_rate": 9.547045431820749e-08, + "logits/chosen": -0.52734375, + "logits/rejected": -0.5546875, + "logps/chosen": -2384.0, + "logps/rejected": -2240.0, + "loss": 0.6272, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.68359375, + "rewards/margins": 0.11669921875, + "rewards/rejected": 0.56640625, + "step": 1510 + }, + { + "epoch": 0.75, + "grad_norm": 8.84073742236179, + "learning_rate": 9.212693310317479e-08, + "logits/chosen": -0.5703125, + "logits/rejected": -0.58984375, + "logps/chosen": -2176.0, + "logps/rejected": -2048.0, + "loss": 0.6444, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.5625, + "rewards/margins": 0.1689453125, + "rewards/rejected": 0.392578125, + "step": 1520 + }, + { + "epoch": 0.75, + "grad_norm": 6.8580337819405335, + "learning_rate": 8.882973508347449e-08, + "logits/chosen": -0.546875, + "logits/rejected": -0.65234375, + "logps/chosen": -1968.0, + "logps/rejected": -1648.0, + "loss": 0.6548, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.54296875, + "rewards/margins": 0.2451171875, + "rewards/rejected": 0.30078125, + "step": 1530 + }, + { + "epoch": 0.76, + "grad_norm": 10.016816397566908, + "learning_rate": 8.557982772462138e-08, + "logits/chosen": -0.54296875, + "logits/rejected": -0.60546875, + "logps/chosen": -2352.0, + "logps/rejected": -1976.0, + "loss": 0.6418, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.6953125, + "rewards/margins": 0.28125, + "rewards/rejected": 0.4140625, + "step": 1540 + }, + { + "epoch": 0.76, + "grad_norm": 11.142392440409802, + "learning_rate": 8.237816461608049e-08, + "logits/chosen": -0.546875, + "logits/rejected": -0.53125, + "logps/chosen": -2048.0, + "logps/rejected": -1928.0, + "loss": 0.6925, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 0.515625, + "rewards/margins": 0.1630859375, + "rewards/rejected": 0.3515625, + "step": 1550 + }, + { + "epoch": 0.77, + "grad_norm": 10.436188393180485, + "learning_rate": 7.922568519146425e-08, + "logits/chosen": -0.58984375, + "logits/rejected": -0.62109375, + "logps/chosen": -2432.0, + "logps/rejected": -2240.0, + "loss": 0.662, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 0.65625, + "rewards/margins": 0.14453125, + "rewards/rejected": 0.515625, + "step": 1560 + }, + { + "epoch": 0.77, + "grad_norm": 8.734002863477851, + "learning_rate": 7.612331445288389e-08, + "logits/chosen": -0.423828125, + "logits/rejected": -0.494140625, + "logps/chosen": -2448.0, + "logps/rejected": -2064.0, + "loss": 0.6432, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.69140625, + "rewards/margins": 0.2001953125, + "rewards/rejected": 0.490234375, + "step": 1570 + }, + { + "epoch": 0.78, + "grad_norm": 10.892280756623634, + "learning_rate": 7.307196269953444e-08, + "logits/chosen": -0.5546875, + "logits/rejected": -0.53515625, + "logps/chosen": -2064.0, + "logps/rejected": -2192.0, + "loss": 0.6439, + "rewards/accuracies": 0.5199999809265137, + "rewards/chosen": 0.478515625, + "rewards/margins": 0.09716796875, + "rewards/rejected": 0.380859375, + "step": 1580 + }, + { + "epoch": 0.78, + "grad_norm": 11.89094090289422, + "learning_rate": 7.007252526059446e-08, + "logits/chosen": -0.494140625, + "logits/rejected": -0.59375, + "logps/chosen": -2624.0, + "logps/rejected": -2160.0, + "loss": 0.6732, + "rewards/accuracies": 0.5199999809265137, + "rewards/chosen": 0.62109375, + "rewards/margins": 0.07421875, + "rewards/rejected": 0.546875, + "step": 1590 + }, + { + "epoch": 0.79, + "grad_norm": 8.353720135556827, + "learning_rate": 6.712588223251809e-08, + "logits/chosen": -0.57421875, + "logits/rejected": -0.62890625, + "logps/chosen": -2480.0, + "logps/rejected": -2192.0, + "loss": 0.6185, + "rewards/accuracies": 0.6599999666213989, + "rewards/chosen": 0.66015625, + "rewards/margins": 0.3359375, + "rewards/rejected": 0.32421875, + "step": 1600 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -0.625, + "eval_logits/rejected": -0.6796875, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2032.0, + "eval_loss": 0.6382187604904175, + "eval_rewards/accuracies": 0.6041666865348816, + "eval_rewards/chosen": 0.59765625, + "eval_rewards/margins": 0.205078125, + "eval_rewards/rejected": 0.392578125, + "eval_runtime": 86.1215, + "eval_samples_per_second": 23.223, + "eval_steps_per_second": 0.488, + "step": 1600 + }, + { + "epoch": 0.79, + "grad_norm": 9.929495280909341, + "learning_rate": 6.423289822079644e-08, + "logits/chosen": -0.482421875, + "logits/rejected": -0.5234375, + "logps/chosen": -2464.0, + "logps/rejected": -2272.0, + "loss": 0.621, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.73828125, + "rewards/margins": 0.28515625, + "rewards/rejected": 0.453125, + "step": 1610 + }, + { + "epoch": 0.79, + "grad_norm": 7.970603963526829, + "learning_rate": 6.139442208626517e-08, + "logits/chosen": -0.58203125, + "logits/rejected": -0.66015625, + "logps/chosen": -2544.0, + "logps/rejected": -2256.0, + "loss": 0.6499, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.60546875, + "rewards/margins": 0.10693359375, + "rewards/rejected": 0.5, + "step": 1620 + }, + { + "epoch": 0.8, + "grad_norm": 7.712927942409274, + "learning_rate": 5.8611286696030795e-08, + "logits/chosen": -0.546875, + "logits/rejected": -0.58203125, + "logps/chosen": -2800.0, + "logps/rejected": -2480.0, + "loss": 0.6347, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.81640625, + "rewards/margins": 0.3125, + "rewards/rejected": 0.50390625, + "step": 1630 + }, + { + "epoch": 0.8, + "grad_norm": 8.214503728500627, + "learning_rate": 5.5884308679090525e-08, + "logits/chosen": -0.5703125, + "logits/rejected": -0.60546875, + "logps/chosen": -2208.0, + "logps/rejected": -2112.0, + "loss": 0.6556, + "rewards/accuracies": 0.4599999785423279, + "rewards/chosen": 0.40625, + "rewards/margins": 0.11474609375, + "rewards/rejected": 0.291015625, + "step": 1640 + }, + { + "epoch": 0.81, + "grad_norm": 8.126286436494889, + "learning_rate": 5.321428818671672e-08, + "logits/chosen": -0.52734375, + "logits/rejected": -0.609375, + "logps/chosen": -2128.0, + "logps/rejected": -1752.0, + "loss": 0.6435, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.443359375, + "rewards/margins": 0.166015625, + "rewards/rejected": 0.27734375, + "step": 1650 + }, + { + "epoch": 0.81, + "grad_norm": 9.727164383599991, + "learning_rate": 5.060200865767605e-08, + "logits/chosen": -0.5546875, + "logits/rejected": -0.6171875, + "logps/chosen": -2336.0, + "logps/rejected": -2040.0, + "loss": 0.627, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.458984375, + "rewards/margins": 0.2470703125, + "rewards/rejected": 0.2119140625, + "step": 1660 + }, + { + "epoch": 0.82, + "grad_norm": 8.55962886651413, + "learning_rate": 4.804823658835233e-08, + "logits/chosen": -0.5859375, + "logits/rejected": -0.671875, + "logps/chosen": -2352.0, + "logps/rejected": -1920.0, + "loss": 0.6377, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.14453125, + "rewards/rejected": 0.369140625, + "step": 1670 + }, + { + "epoch": 0.82, + "grad_norm": 13.3092741964271, + "learning_rate": 4.555372130784102e-08, + "logits/chosen": -0.65625, + "logits/rejected": -0.71875, + "logps/chosen": -1912.0, + "logps/rejected": -1752.0, + "loss": 0.6305, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.376953125, + "rewards/margins": 0.0341796875, + "rewards/rejected": 0.34375, + "step": 1680 + }, + { + "epoch": 0.83, + "grad_norm": 8.172767944806202, + "learning_rate": 4.311919475808037e-08, + "logits/chosen": -0.5546875, + "logits/rejected": -0.609375, + "logps/chosen": -2256.0, + "logps/rejected": -2032.0, + "loss": 0.6572, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.53125, + "rewards/margins": 0.1796875, + "rewards/rejected": 0.3515625, + "step": 1690 + }, + { + "epoch": 0.83, + "grad_norm": 8.68637911065445, + "learning_rate": 4.0745371279084976e-08, + "logits/chosen": -0.5, + "logits/rejected": -0.53515625, + "logps/chosen": -2528.0, + "logps/rejected": -2240.0, + "loss": 0.6408, + "rewards/accuracies": 0.5400000810623169, + "rewards/chosen": 0.6484375, + "rewards/margins": 0.189453125, + "rewards/rejected": 0.458984375, + "step": 1700 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -0.6171875, + "eval_logits/rejected": -0.671875, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2024.0, + "eval_loss": 0.6374297142028809, + "eval_rewards/accuracies": 0.5952380895614624, + "eval_rewards/chosen": 0.59765625, + "eval_rewards/margins": 0.2041015625, + "eval_rewards/rejected": 0.392578125, + "eval_runtime": 86.0796, + "eval_samples_per_second": 23.234, + "eval_steps_per_second": 0.488, + "step": 1700 + }, + { + "epoch": 0.84, + "grad_norm": 12.620733225081068, + "learning_rate": 3.843294739934369e-08, + "logits/chosen": -0.515625, + "logits/rejected": -0.53515625, + "logps/chosen": -2176.0, + "logps/rejected": -2208.0, + "loss": 0.6653, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.6015625, + "rewards/margins": 0.142578125, + "rewards/rejected": 0.45703125, + "step": 1710 + }, + { + "epoch": 0.84, + "grad_norm": 11.28123633647817, + "learning_rate": 3.6182601631443596e-08, + "logits/chosen": -0.5, + "logits/rejected": -0.62890625, + "logps/chosen": -2752.0, + "logps/rejected": -2032.0, + "loss": 0.6464, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.671875, + "rewards/margins": 0.1640625, + "rewards/rejected": 0.5078125, + "step": 1720 + }, + { + "epoch": 0.85, + "grad_norm": 9.053407610350268, + "learning_rate": 3.3994994272980944e-08, + "logits/chosen": -0.50390625, + "logits/rejected": -0.6171875, + "logps/chosen": -2384.0, + "logps/rejected": -1832.0, + "loss": 0.6394, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.59375, + "rewards/margins": 0.271484375, + "rewards/rejected": 0.3203125, + "step": 1730 + }, + { + "epoch": 0.85, + "grad_norm": 10.202733795570047, + "learning_rate": 3.187076721281595e-08, + "logits/chosen": -0.55078125, + "logits/rejected": -0.6484375, + "logps/chosen": -2080.0, + "logps/rejected": -1744.0, + "loss": 0.6662, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 0.474609375, + "rewards/margins": 0.1572265625, + "rewards/rejected": 0.31640625, + "step": 1740 + }, + { + "epoch": 0.86, + "grad_norm": 9.310568396714823, + "learning_rate": 2.9810543742729705e-08, + "logits/chosen": -0.5390625, + "logits/rejected": -0.56640625, + "logps/chosen": -2304.0, + "logps/rejected": -2096.0, + "loss": 0.6431, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 0.6796875, + "rewards/margins": 0.263671875, + "rewards/rejected": 0.4140625, + "step": 1750 + }, + { + "epoch": 0.86, + "grad_norm": 9.254692446674307, + "learning_rate": 2.7814928374537334e-08, + "logits/chosen": -0.48828125, + "logits/rejected": -0.625, + "logps/chosen": -2272.0, + "logps/rejected": -1696.0, + "loss": 0.631, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.51171875, + "rewards/margins": 0.1708984375, + "rewards/rejected": 0.33984375, + "step": 1760 + }, + { + "epoch": 0.87, + "grad_norm": 7.485091349390559, + "learning_rate": 2.5884506662711886e-08, + "logits/chosen": -0.49609375, + "logits/rejected": -0.62109375, + "logps/chosen": -2576.0, + "logps/rejected": -2008.0, + "loss": 0.6258, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.68359375, + "rewards/margins": 0.296875, + "rewards/rejected": 0.388671875, + "step": 1770 + }, + { + "epoch": 0.87, + "grad_norm": 7.583505828929572, + "learning_rate": 2.4019845032570875e-08, + "logits/chosen": -0.4921875, + "logits/rejected": -0.5859375, + "logps/chosen": -2688.0, + "logps/rejected": -2176.0, + "loss": 0.6613, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.1943359375, + "rewards/rejected": 0.435546875, + "step": 1780 + }, + { + "epoch": 0.88, + "grad_norm": 10.11330279692475, + "learning_rate": 2.222149061407527e-08, + "logits/chosen": -0.43359375, + "logits/rejected": -0.5078125, + "logps/chosen": -3072.0, + "logps/rejected": -2704.0, + "loss": 0.6427, + "rewards/accuracies": 0.46000003814697266, + "rewards/chosen": 0.8828125, + "rewards/margins": 0.140625, + "rewards/rejected": 0.7421875, + "step": 1790 + }, + { + "epoch": 0.88, + "grad_norm": 10.263183451230402, + "learning_rate": 2.0489971081290193e-08, + "logits/chosen": -0.5703125, + "logits/rejected": -0.58984375, + "logps/chosen": -2208.0, + "logps/rejected": -1936.0, + "loss": 0.662, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.62890625, + "rewards/margins": 0.11181640625, + "rewards/rejected": 0.515625, + "step": 1800 + }, + { + "epoch": 0.88, + "eval_logits/chosen": -0.62890625, + "eval_logits/rejected": -0.68359375, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2024.0, + "eval_loss": 0.6355390548706055, + "eval_rewards/accuracies": 0.601190447807312, + "eval_rewards/chosen": 0.609375, + "eval_rewards/margins": 0.2119140625, + "eval_rewards/rejected": 0.3984375, + "eval_runtime": 86.081, + "eval_samples_per_second": 23.234, + "eval_steps_per_second": 0.488, + "step": 1800 + }, + { + "epoch": 0.89, + "grad_norm": 7.814802788464052, + "learning_rate": 1.882579449755495e-08, + "logits/chosen": -0.515625, + "logits/rejected": -0.640625, + "logps/chosen": -2656.0, + "logps/rejected": -2128.0, + "loss": 0.6186, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.75, + "rewards/margins": 0.29296875, + "rewards/rejected": 0.45703125, + "step": 1810 + }, + { + "epoch": 0.89, + "grad_norm": 9.220542164026453, + "learning_rate": 1.7229449166406477e-08, + "logits/chosen": -0.54296875, + "logits/rejected": -0.58203125, + "logps/chosen": -2496.0, + "logps/rejected": -2256.0, + "loss": 0.658, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.1083984375, + "rewards/rejected": 0.52734375, + "step": 1820 + }, + { + "epoch": 0.9, + "grad_norm": 11.528605527636161, + "learning_rate": 1.5701403488301235e-08, + "logits/chosen": -0.5546875, + "logits/rejected": -0.625, + "logps/chosen": -2288.0, + "logps/rejected": -2000.0, + "loss": 0.6381, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 0.53125, + "rewards/margins": 0.255859375, + "rewards/rejected": 0.2734375, + "step": 1830 + }, + { + "epoch": 0.9, + "grad_norm": 13.882026366198952, + "learning_rate": 1.4242105823176837e-08, + "logits/chosen": -0.671875, + "logits/rejected": -0.7421875, + "logps/chosen": -2096.0, + "logps/rejected": -1752.0, + "loss": 0.6506, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.515625, + "rewards/margins": 0.19140625, + "rewards/rejected": 0.32421875, + "step": 1840 + }, + { + "epoch": 0.91, + "grad_norm": 9.608713807399866, + "learning_rate": 1.285198435889398e-08, + "logits/chosen": -0.484375, + "logits/rejected": -0.56640625, + "logps/chosen": -2480.0, + "logps/rejected": -1984.0, + "loss": 0.6507, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6015625, + "rewards/margins": 0.2275390625, + "rewards/rejected": 0.373046875, + "step": 1850 + }, + { + "epoch": 0.91, + "grad_norm": 9.195872685886451, + "learning_rate": 1.1531446985597604e-08, + "logits/chosen": -0.671875, + "logits/rejected": -0.5859375, + "logps/chosen": -1664.0, + "logps/rejected": -2064.0, + "loss": 0.6717, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 0.447265625, + "rewards/margins": -0.01806640625, + "rewards/rejected": 0.46484375, + "step": 1860 + }, + { + "epoch": 0.92, + "grad_norm": 10.06291940471119, + "learning_rate": 1.0280881176033318e-08, + "logits/chosen": -0.55859375, + "logits/rejected": -0.640625, + "logps/chosen": -2400.0, + "logps/rejected": -1952.0, + "loss": 0.6418, + "rewards/accuracies": 0.5600000619888306, + "rewards/chosen": 0.6328125, + "rewards/margins": 0.232421875, + "rewards/rejected": 0.40234375, + "step": 1870 + }, + { + "epoch": 0.92, + "grad_norm": 9.690917680622107, + "learning_rate": 9.100653871854963e-09, + "logits/chosen": -0.55078125, + "logits/rejected": -0.60546875, + "logps/chosen": -2608.0, + "logps/rejected": -2336.0, + "loss": 0.6336, + "rewards/accuracies": 0.5199999809265137, + "rewards/chosen": 0.68359375, + "rewards/margins": 0.31640625, + "rewards/rejected": 0.3671875, + "step": 1880 + }, + { + "epoch": 0.93, + "grad_norm": 7.698287956753982, + "learning_rate": 7.991111375956539e-09, + "logits/chosen": -0.470703125, + "logits/rejected": -0.54296875, + "logps/chosen": -2480.0, + "logps/rejected": -2224.0, + "loss": 0.6679, + "rewards/accuracies": 0.4599999785423279, + "rewards/chosen": 0.60546875, + "rewards/margins": 0.109375, + "rewards/rejected": 0.498046875, + "step": 1890 + }, + { + "epoch": 0.93, + "grad_norm": 9.319385343543805, + "learning_rate": 6.9525792508597634e-09, + "logits/chosen": -0.59765625, + "logits/rejected": -0.6015625, + "logps/chosen": -2336.0, + "logps/rejected": -2320.0, + "loss": 0.6385, + "rewards/accuracies": 0.6599999666213989, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.2236328125, + "rewards/rejected": 0.3671875, + "step": 1900 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -0.62109375, + "eval_logits/rejected": -0.67578125, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2024.0, + "eval_loss": 0.6378594040870667, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": 0.60546875, + "eval_rewards/margins": 0.212890625, + "eval_rewards/rejected": 0.392578125, + "eval_runtime": 86.1786, + "eval_samples_per_second": 23.208, + "eval_steps_per_second": 0.487, + "step": 1900 + }, + { + "epoch": 0.94, + "grad_norm": 7.544043526165161, + "learning_rate": 5.985362223187296e-09, + "logits/chosen": -0.47265625, + "logits/rejected": -0.54296875, + "logps/chosen": -2512.0, + "logps/rejected": -2176.0, + "loss": 0.6528, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 0.66015625, + "rewards/margins": 0.21484375, + "rewards/rejected": 0.4453125, + "step": 1910 + }, + { + "epoch": 0.94, + "grad_norm": 9.355498575273248, + "learning_rate": 5.089744094249837e-09, + "logits/chosen": -0.58203125, + "logits/rejected": -0.6953125, + "logps/chosen": -2848.0, + "logps/rejected": -2304.0, + "loss": 0.6129, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.75390625, + "rewards/margins": 0.279296875, + "rewards/rejected": 0.4765625, + "step": 1920 + }, + { + "epoch": 0.95, + "grad_norm": 9.114147510919343, + "learning_rate": 4.265987656772857e-09, + "logits/chosen": -0.55078125, + "logits/rejected": -0.625, + "logps/chosen": -2352.0, + "logps/rejected": -1928.0, + "loss": 0.6488, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.69921875, + "rewards/margins": 0.31640625, + "rewards/rejected": 0.3828125, + "step": 1930 + }, + { + "epoch": 0.95, + "grad_norm": 11.411261082676203, + "learning_rate": 3.5143346177878565e-09, + "logits/chosen": -0.46484375, + "logits/rejected": -0.578125, + "logps/chosen": -2752.0, + "logps/rejected": -2176.0, + "loss": 0.6235, + "rewards/accuracies": 0.7799999713897705, + "rewards/chosen": 0.83984375, + "rewards/margins": 0.482421875, + "rewards/rejected": 0.357421875, + "step": 1940 + }, + { + "epoch": 0.96, + "grad_norm": 7.165814163311654, + "learning_rate": 2.835005527710682e-09, + "logits/chosen": -0.5234375, + "logits/rejected": -0.60546875, + "logps/chosen": -2368.0, + "logps/rejected": -1904.0, + "loss": 0.6375, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 0.58984375, + "rewards/margins": 0.302734375, + "rewards/rejected": 0.2890625, + "step": 1950 + }, + { + "epoch": 0.96, + "grad_norm": 8.854995734298003, + "learning_rate": 2.2281997156273213e-09, + "logits/chosen": -0.4296875, + "logits/rejected": -0.61328125, + "logps/chosen": -2720.0, + "logps/rejected": -1928.0, + "loss": 0.6276, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 0.671875, + "rewards/margins": 0.373046875, + "rewards/rejected": 0.298828125, + "step": 1960 + }, + { + "epoch": 0.97, + "grad_norm": 8.38257510772057, + "learning_rate": 1.6940952308068523e-09, + "logits/chosen": -0.59375, + "logits/rejected": -0.671875, + "logps/chosen": -2576.0, + "logps/rejected": -2160.0, + "loss": 0.6368, + "rewards/accuracies": 0.7600001096725464, + "rewards/chosen": 0.8125, + "rewards/margins": 0.439453125, + "rewards/rejected": 0.373046875, + "step": 1970 + }, + { + "epoch": 0.97, + "grad_norm": 11.857430374835033, + "learning_rate": 1.2328487904580131e-09, + "logits/chosen": -0.61328125, + "logits/rejected": -0.6796875, + "logps/chosen": -2704.0, + "logps/rejected": -2288.0, + "loss": 0.6438, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": 0.63671875, + "rewards/margins": 0.1953125, + "rewards/rejected": 0.443359375, + "step": 1980 + }, + { + "epoch": 0.98, + "grad_norm": 7.7776326201834, + "learning_rate": 8.445957337451515e-10, + "logits/chosen": -0.546875, + "logits/rejected": -0.6328125, + "logps/chosen": -2336.0, + "logps/rejected": -2000.0, + "loss": 0.6336, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 0.6953125, + "rewards/margins": 0.26953125, + "rewards/rejected": 0.423828125, + "step": 1990 + }, + { + "epoch": 0.98, + "grad_norm": 8.452587117057995, + "learning_rate": 5.29449982077046e-10, + "logits/chosen": -0.578125, + "logits/rejected": -0.640625, + "logps/chosen": -2336.0, + "logps/rejected": -2016.0, + "loss": 0.6154, + "rewards/accuracies": 0.6800000667572021, + "rewards/chosen": 0.6171875, + "rewards/margins": 0.2373046875, + "rewards/rejected": 0.37890625, + "step": 2000 + }, + { + "epoch": 0.98, + "eval_logits/chosen": -0.62109375, + "eval_logits/rejected": -0.67578125, + "eval_logps/chosen": -2320.0, + "eval_logps/rejected": -2024.0, + "eval_loss": 0.6380937695503235, + "eval_rewards/accuracies": 0.601190447807312, + "eval_rewards/chosen": 0.609375, + "eval_rewards/margins": 0.2041015625, + "eval_rewards/rejected": 0.404296875, + "eval_runtime": 86.1049, + "eval_samples_per_second": 23.227, + "eval_steps_per_second": 0.488, + "step": 2000 + }, + { + "epoch": 0.99, + "grad_norm": 12.27319369048997, + "learning_rate": 2.875040056799227e-10, + "logits/chosen": -0.5859375, + "logits/rejected": -0.57421875, + "logps/chosen": -2304.0, + "logps/rejected": -2432.0, + "loss": 0.6572, + "rewards/accuracies": 0.5200000405311584, + "rewards/chosen": 0.58203125, + "rewards/margins": 0.08447265625, + "rewards/rejected": 0.498046875, + "step": 2010 + }, + { + "epoch": 0.99, + "grad_norm": 9.006712698632741, + "learning_rate": 1.1882879646485379e-10, + "logits/chosen": -0.54296875, + "logits/rejected": -0.6015625, + "logps/chosen": -2040.0, + "logps/rejected": -1824.0, + "loss": 0.6547, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.53515625, + "rewards/margins": 0.1533203125, + "rewards/rejected": 0.3828125, + "step": 2020 + }, + { + "epoch": 1.0, + "grad_norm": 9.784506721487483, + "learning_rate": 2.3473847197225115e-11, + "logits/chosen": -0.54296875, + "logits/rejected": -0.671875, + "logps/chosen": -2512.0, + "logps/rejected": -1984.0, + "loss": 0.6671, + "rewards/accuracies": 0.5400000810623169, + "rewards/chosen": 0.703125, + "rewards/margins": 0.259765625, + "rewards/rejected": 0.443359375, + "step": 2030 + }, + { + "epoch": 1.0, + "step": 2038, + "total_flos": 0.0, + "train_loss": 0.6502101503246783, + "train_runtime": 8979.6364, + "train_samples_per_second": 6.808, + "train_steps_per_second": 0.227 + } + ], + "logging_steps": 10, + "max_steps": 2038, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}